"""Primary evaluator for the AlphaEvolve AC2 task.""" import argparse import sys from pathlib import Path from typing import Any, Dict, List, Optional, Tuple import numpy as np # Add project root to import path for direct execution. sys.path.insert(0, str(Path(__file__).resolve().parents[2])) from shinka.core import run_shinka_eval def evaluate_sequence(sequence: list[float]) -> float: """Evaluate sequence. Higher is better.""" if not isinstance(sequence, list): raise ValueError("Invalid sequence type") if not sequence: raise ValueError("Empty sequence") for x in sequence: if isinstance(x, bool) or not isinstance(x, (int, float)): raise ValueError("Invalid sequence element type") if np.isnan(x) or np.isinf(x): raise ValueError("Invalid sequence element value") sequence = [float(x) for x in sequence] sequence = [max(0.0, x) for x in sequence] if np.sum(sequence) < 0.01: raise ValueError("Sum of sequence is too close to zero") sequence = [min(1000.0, x) for x in sequence] convolution_2 = np.convolve(sequence, sequence) num_points = len(convolution_2) x_points = np.linspace(-0.5, 0.5, num_points + 2) x_intervals = np.diff(x_points) y_points = np.concatenate(([0.0], convolution_2, [0.0])) l2_norm_squared = 0.0 for i in range(len(convolution_2) + 1): y1 = y_points[i] y2 = y_points[i + 1] h = x_intervals[i] l2_norm_squared += (h / 3.0) * (y1**2 + y1 * y2 + y2**2) norm_1 = np.sum(np.abs(convolution_2)) / (len(convolution_2) + 1) norm_inf = np.max(np.abs(convolution_2)) if norm_1 <= 0.0 or norm_inf <= 0.0: raise ValueError("Degenerate convolution norms") return float(l2_norm_squared / (norm_1 * norm_inf)) def validate_run_output(run_output: Any) -> Tuple[bool, Optional[str]]: """Validate run output structure and objective computability.""" try: if not isinstance(run_output, list): return False, "run() must return list[float]" if len(run_output) == 0: return False, "run() returned empty list" value = evaluate_sequence(run_output) if not np.isfinite(value): return False, "Objective is inf/nan" return True, None except Exception as exc: return False, str(exc) def aggregate_alphaevolve_ac2_metrics(results: List[list[float]]) -> Dict[str, Any]: """Aggregate metrics using best objective value as ranking signal.""" if not results: return { "combined_score": 0.0, "public": {"best_value": None, "num_runs": 0}, "private": {"all_values": []}, "text_feedback": "No successful runs.", } values: List[float] = [] lengths: List[int] = [] best_sequence: Optional[list[float]] = None best_value = -float(np.inf) for seq in results: val = evaluate_sequence(seq) values.append(float(val)) lengths.append(len(seq)) if val > best_value: best_value = float(val) best_sequence = seq # Maximization task: combined score is best value directly. combined_score = best_value public = { "best_value": best_value, "best_length": len(best_sequence) if best_sequence is not None else None, "num_runs": len(results), } private = { "all_values": values, "all_lengths": lengths, } return { "combined_score": combined_score, "public": public, "private": private, "text_feedback": ( "Higher evaluate_sequence value is better. " "combined_score = best_value." ), } def main(program_path: str, results_dir: str, num_experiment_runs: int = 1): print(f"Evaluating program: {program_path}") print(f"Saving results to: {results_dir}") print(f"Number of runs: {num_experiment_runs}") metrics, correct, error = run_shinka_eval( program_path=program_path, results_dir=results_dir, experiment_fn_name="run", num_runs=num_experiment_runs, validate_fn=validate_run_output, aggregate_metrics_fn=aggregate_alphaevolve_ac2_metrics, ) if correct: print("Evaluation completed successfully.") else: print(f"Evaluation failed: {error}") print(f"combined_score={metrics.get('combined_score')}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Evaluate AlphaEvolve AC2 task") parser.add_argument( "--program_path", type=str, default="tasks/alphaevolve_ac2/initial.py", ) parser.add_argument( "--results_dir", type=str, default="tasks/alphaevolve_ac2/results/debug_eval", ) parser.add_argument( "--num_experiment_runs", type=int, default=1, ) args = parser.parse_args() main( program_path=args.program_path, results_dir=args.results_dir, num_experiment_runs=args.num_experiment_runs, )