| """Primary evaluator for the AlphaEvolve AC task.""" |
|
|
| import argparse |
| import sys |
| from pathlib import Path |
| from typing import Any, Dict, List, Optional, Tuple |
|
|
| import numpy as np |
|
|
| |
| sys.path.insert(0, str(Path(__file__).resolve().parents[2])) |
|
|
| from shinka.core import run_shinka_eval |
|
|
|
|
def evaluate_sequence(sequence: list[float]) -> float:
    """Score a coefficient sequence; lower is better.

    Every entry is validated and clamped to ``[0, 1000]``. The score is
    ``2 * n * max(conv) / sum(a) ** 2`` where ``a`` is the clamped sequence,
    ``n`` its length and ``conv`` the discrete convolution of ``a`` with
    itself.

    Args:
        sequence: Non-empty list of real numbers (``int`` or ``float``).

    Returns:
        The score as a ``float``, or ``inf`` when the input is invalid: not a
        list, empty, containing a ``bool`` or non-numeric entry, containing
        NaN/inf, containing an integer too large to convert to ``float``, or
        when the clamped entries sum to less than ``0.01``.
    """
    invalid = float(np.inf)
    if not isinstance(sequence, list) or not sequence:
        return invalid

    clamped: list[float] = []
    for x in sequence:
        # bool is a subclass of int, so it has to be rejected explicitly.
        if isinstance(x, bool) or not isinstance(x, (int, float)):
            return invalid
        # Convert before touching NumPy: arbitrarily large Python ints make
        # NumPy's float ufuncs raise, and float() itself overflows on them.
        try:
            value = float(x)
        except OverflowError:
            return invalid
        if not np.isfinite(value):
            return invalid
        clamped.append(min(1000.0, max(0.0, value)))

    n = len(clamped)
    self_convolution = np.convolve(clamped, clamped)
    max_b = float(np.max(self_convolution))
    sum_a = float(np.sum(clamped))
    # A (near-)zero sum would make the ratio below blow up; treat as invalid.
    if sum_a < 0.01:
        return invalid

    return float(2.0 * n * max_b / (sum_a**2))
|
|
|
|
def validate_run_output(run_output: Any) -> Tuple[bool, Optional[str]]:
    """Check that a ``run()`` result is a non-empty list with a finite score.

    Returns:
        ``(True, None)`` when the output is acceptable, otherwise
        ``(False, reason)``. Any exception raised while checking is reported
        as the failure reason instead of being propagated.
    """
    problem: Optional[str] = None
    try:
        if not isinstance(run_output, list):
            problem = "run() must return list[float]"
        elif not run_output:
            problem = "run() returned empty list"
        elif not np.isfinite(evaluate_sequence(run_output)):
            problem = "evaluate_sequence returned inf/nan"
    except Exception as err:
        problem = str(err)
    return problem is None, problem
|
|
|
|
def aggregate_alphaevolve_ac_metrics(results: List[list[float]]) -> Dict[str, Any]:
    """Aggregate per-run sequences into a metrics dict ranked by the best run.

    Each sequence is scored with ``evaluate_sequence`` (lower is better). The
    lowest score becomes ``best_value`` and ``combined_score`` is its
    negation.

    Args:
        results: One coefficient sequence per run.

    Returns:
        A dict with keys ``combined_score``, ``public`` (``best_value``,
        ``best_length``, ``num_runs``), ``private`` (``all_values``,
        ``all_lengths``) and ``text_feedback``. The same keys are present
        when ``results`` is empty; in that case ``combined_score`` is ``0.0``
        and ``best_value``/``best_length`` are ``None``. If no sequence
        scores below infinity, ``best_value`` stays ``inf``, ``best_length``
        is ``None`` and ``combined_score`` is ``-inf``.
    """
    if not results:
        # Keep the same key set as the populated branch so consumers can read
        # best_length / all_lengths without special-casing empty runs.
        return {
            "combined_score": 0.0,
            "public": {"best_value": None, "best_length": None, "num_runs": 0},
            "private": {"all_values": [], "all_lengths": []},
            "text_feedback": "No successful runs.",
        }

    values: List[float] = []
    lengths: List[int] = []
    best_sequence: Optional[list[float]] = None
    best_value = float(np.inf)

    for seq in results:
        val = float(evaluate_sequence(seq))
        values.append(val)
        lengths.append(len(seq))
        # Strict comparison: on ties the earliest run is kept as the best.
        if val < best_value:
            best_value = val
            best_sequence = seq

    return {
        "combined_score": -best_value,
        "public": {
            "best_value": best_value,
            "best_length": len(best_sequence) if best_sequence is not None else None,
            "num_runs": len(results),
        },
        "private": {
            "all_values": values,
            "all_lengths": lengths,
        },
        "text_feedback": (
            "Lower evaluate_sequence value is better. "
            "combined_score = -best_value."
        ),
    }
|
|
|
|
def main(program_path: str, results_dir: str, num_experiment_runs: int = 1):
    """Evaluate a candidate program and print the outcome.

    Delegates to ``run_shinka_eval``, asking it to call the program's ``run``
    function ``num_experiment_runs`` times with this module's validation and
    aggregation callbacks, then prints whether the evaluation succeeded and
    the resulting ``combined_score``.

    NOTE(review): writing metrics.json/correct.json into ``results_dir`` is
    presumably done inside ``run_shinka_eval``; that is not visible here.
    """
    banner_lines = (
        f"Evaluating program: {program_path}",
        f"Saving results to: {results_dir}",
        f"Number of runs: {num_experiment_runs}",
    )
    for line in banner_lines:
        print(line)

    metrics, correct, error = run_shinka_eval(
        program_path=program_path,
        results_dir=results_dir,
        experiment_fn_name="run",
        num_runs=num_experiment_runs,
        validate_fn=validate_run_output,
        aggregate_metrics_fn=aggregate_alphaevolve_ac_metrics,
    )

    status = (
        "Evaluation completed successfully."
        if correct
        else f"Evaluation failed: {error}"
    )
    print(status)
    print(f"combined_score={metrics.get('combined_score')}")
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point; every option is optional and falls back to
    # the default listed below.
    cli = argparse.ArgumentParser(description="Evaluate AlphaEvolve AC task")
    option_specs = (
        ("--program_path", str, "tasks/alphaevolve_ac/initial.py"),
        ("--results_dir", str, "tasks/alphaevolve_ac/results/debug_eval"),
        ("--num_experiment_runs", int, 1),
    )
    for flag, value_type, fallback in option_specs:
        cli.add_argument(flag, type=value_type, default=fallback)

    opts = cli.parse_args()
    main(
        program_path=opts.program_path,
        results_dir=opts.results_dir,
        num_experiment_runs=opts.num_experiment_runs,
    )
|
|
|
|