| """Primary evaluator for the AlphaEvolve AC2 task.""" |
|
|
| import argparse |
| import sys |
| from pathlib import Path |
| from typing import Any, Dict, List, Optional, Tuple |
|
|
| import numpy as np |
|
|
| |
| sys.path.insert(0, str(Path(__file__).resolve().parents[2])) |
|
|
| from shinka.core import run_shinka_eval |
|
|
|
|
def evaluate_sequence(sequence: list[float]) -> float:
    """Score a sequence by the norms of its self-convolution. Higher is better.

    Each element is converted to ``float``, negatives are clamped to 0 and
    values above 1000 are capped.  The cleaned sequence is convolved with
    itself and the score is ``l2_norm_squared / (norm_1 * norm_inf)`` where:

    * ``l2_norm_squared`` is the exact integral of the squared piecewise-linear
      interpolant of the convolution on ``[-0.5, 0.5]`` (zero at both ends),
    * ``norm_1`` is the sum of absolute convolution values divided by
      ``len(convolution) + 1``,
    * ``norm_inf`` is the largest absolute convolution value.

    Args:
        sequence: Non-empty list of real numbers (``int`` or ``float``;
            ``bool`` is rejected).

    Returns:
        The objective value as a Python ``float``.

    Raises:
        ValueError: If ``sequence`` is not a list, is empty, contains a
            non-numeric / NaN / infinite / unrepresentable element, sums to
            less than 0.01 after clamping negatives, or yields degenerate
            convolution norms.
    """
    if not isinstance(sequence, list):
        raise ValueError("Invalid sequence type")
    if not sequence:
        raise ValueError("Empty sequence")

    cleaned: list[float] = []
    for x in sequence:
        # bool is a subclass of int, so it has to be rejected explicitly.
        if isinstance(x, bool) or not isinstance(x, (int, float)):
            raise ValueError("Invalid sequence element type")
        # Convert before the NaN/inf test: NumPy cannot apply isnan/isinf to
        # Python ints beyond the 64-bit range and would raise TypeError.
        try:
            value = float(x)
        except OverflowError as exc:
            raise ValueError("Invalid sequence element value") from exc
        if np.isnan(value) or np.isinf(value):
            raise ValueError("Invalid sequence element value")
        cleaned.append(max(0.0, value))

    # The sum check intentionally runs before the upper cap is applied.
    if np.sum(cleaned) < 0.01:
        raise ValueError("Sum of sequence is too close to zero")
    cleaned = [min(1000.0, value) for value in cleaned]

    convolution_2 = np.convolve(cleaned, cleaned)
    num_points = len(convolution_2)
    x_intervals = np.diff(np.linspace(-0.5, 0.5, num_points + 2))
    # Pad with zeros so the interpolant vanishes at both interval end points.
    y_points = np.concatenate(([0.0], convolution_2, [0.0]))

    # Integral of a squared linear segment: h/3 * (y1^2 + y1*y2 + y2^2).
    # Accumulated sequentially (not vectorised) so scores stay bit-identical.
    l2_norm_squared = 0.0
    for h, y1, y2 in zip(x_intervals, y_points[:-1], y_points[1:]):
        l2_norm_squared += (h / 3.0) * (y1**2 + y1 * y2 + y2**2)

    norm_1 = np.sum(np.abs(convolution_2)) / (num_points + 1)
    norm_inf = np.max(np.abs(convolution_2))
    if norm_1 <= 0.0 or norm_inf <= 0.0:
        raise ValueError("Degenerate convolution norms")

    return float(l2_norm_squared / (norm_1 * norm_inf))
|
|
|
|
def validate_run_output(run_output: Any) -> Tuple[bool, Optional[str]]:
    """Check that ``run()`` produced a non-empty list with a finite objective.

    Returns:
        ``(True, None)`` when the output is acceptable, otherwise
        ``(False, reason)`` where ``reason`` is a human-readable message.
        Any exception raised while checking is reported as the reason
        rather than propagated.
    """
    try:
        if not isinstance(run_output, list):
            return False, "run() must return list[float]"
        if len(run_output) < 1:
            return False, "run() returned empty list"
        # Scoring the sequence is the real test: it rejects bad elements too.
        objective = evaluate_sequence(run_output)
        if np.isfinite(objective):
            return True, None
        return False, "Objective is inf/nan"
    except Exception as failure:
        return False, str(failure)
|
|
|
|
def aggregate_alphaevolve_ac2_metrics(results: List[list[float]]) -> Dict[str, Any]:
    """Summarise all runs, ranking by the best objective value.

    Every sequence in ``results`` is scored with ``evaluate_sequence``; the
    highest score becomes ``combined_score``.  With no results a zero-score
    placeholder is returned instead.

    Returns:
        A dict with ``combined_score``, ``public`` (best value, its sequence
        length, run count), ``private`` (per-run values and lengths) and
        ``text_feedback``.
    """
    if not results:
        return {
            "combined_score": 0.0,
            "public": {"best_value": None, "num_runs": 0},
            "private": {"all_values": []},
            "text_feedback": "No successful runs.",
        }

    scores: List[float] = [float(evaluate_sequence(candidate)) for candidate in results]
    sizes: List[int] = [len(candidate) for candidate in results]

    # Strict '>' keeps the earliest run when several share the top score.
    top_score = -float(np.inf)
    top_candidate: Optional[list[float]] = None
    for candidate, score in zip(results, scores):
        if score > top_score:
            top_score, top_candidate = score, candidate

    top_length = None if top_candidate is None else len(top_candidate)

    return {
        "combined_score": top_score,
        "public": {
            "best_value": top_score,
            "best_length": top_length,
            "num_runs": len(results),
        },
        "private": {
            "all_values": scores,
            "all_lengths": sizes,
        },
        "text_feedback": (
            "Higher evaluate_sequence value is better. "
            "combined_score = best_value."
        ),
    }
|
|
|
|
def main(program_path: str, results_dir: str, num_experiment_runs: int = 1):
    """Evaluate ``program_path`` via ``run_shinka_eval`` and print the outcome.

    Args:
        program_path: Path of the program whose ``run`` function is evaluated.
        results_dir: Directory passed to ``run_shinka_eval`` for its results.
        num_experiment_runs: Number of runs requested from ``run_shinka_eval``.
    """
    for banner in (
        f"Evaluating program: {program_path}",
        f"Saving results to: {results_dir}",
        f"Number of runs: {num_experiment_runs}",
    ):
        print(banner)

    metrics, correct, error = run_shinka_eval(
        program_path=program_path,
        results_dir=results_dir,
        experiment_fn_name="run",
        num_runs=num_experiment_runs,
        validate_fn=validate_run_output,
        aggregate_metrics_fn=aggregate_alphaevolve_ac2_metrics,
    )

    # The failure message is only formatted when the evaluation did not pass.
    outcome = (
        "Evaluation completed successfully."
        if correct
        else f"Evaluation failed: {error}"
    )
    print(outcome)
    print(f"combined_score={metrics.get('combined_score')}")
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point; every option is optional and has a default.
    cli = argparse.ArgumentParser(description="Evaluate AlphaEvolve AC2 task")
    option_specs = (
        ("--program_path", str, "tasks/alphaevolve_ac2/initial.py"),
        ("--results_dir", str, "tasks/alphaevolve_ac2/results/debug_eval"),
        ("--num_experiment_runs", int, 1),
    )
    for flag, value_type, fallback in option_specs:
        cli.add_argument(flag, type=value_type, default=fallback)
    options = cli.parse_args()
    main(
        program_path=options.program_path,
        results_dir=options.results_dir,
        num_experiment_runs=options.num_experiment_runs,
    )
|
|
|
|