shinka-backup / tasks /alphaevolve_ac2 /evaluate_ori.py
JustinTX's picture
Add files using upload-large-folder tool
40607c3 verified
"""Primary evaluator for the AlphaEvolve AC2 task."""
import argparse
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
# Add project root to import path for direct execution.
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from shinka.core import run_shinka_eval
def evaluate_sequence(sequence: list[float]) -> float:
    """Score a sequence of step heights; higher is better.

    The sequence is clamped to ``[0, 1000]`` and convolved with itself.  The
    convolution samples are placed at the interior points of an evenly spaced
    grid on ``[-0.5, 0.5]`` with zeros appended at both ends, and the score is

        l2_norm_squared / (norm_1 * norm_inf)

    where ``l2_norm_squared`` integrates the square of the piecewise-linear
    interpolant of those samples, ``norm_1`` is the sum of absolute
    convolution values divided by ``len(convolution) + 1``, and ``norm_inf``
    is the largest absolute convolution value.

    Args:
        sequence: Non-empty list of real numbers (``int`` or ``float``;
            ``bool`` is rejected).

    Returns:
        The score as a Python ``float``.

    Raises:
        ValueError: If ``sequence`` is not a list, is empty, contains a
            non-numeric, NaN, infinite or float-unrepresentable element, sums
            to less than 0.01 after negative values are clamped to zero, or
            yields degenerate convolution norms.
    """
    if not isinstance(sequence, list):
        raise ValueError("Invalid sequence type")
    if not sequence:
        raise ValueError("Empty sequence")

    heights = []
    for x in sequence:
        # bool is a subclass of int, so it has to be rejected explicitly.
        if isinstance(x, bool) or not isinstance(x, (int, float)):
            raise ValueError("Invalid sequence element type")
        # Convert before the finiteness check: NumPy's isnan/isinf reject
        # Python ints that do not fit a 64-bit integer, which would leak an
        # undocumented exception for otherwise valid (cappable) input.
        try:
            value = float(x)
        except OverflowError as exc:
            raise ValueError("Invalid sequence element value") from exc
        if not np.isfinite(value):
            raise ValueError("Invalid sequence element value")
        # Negative heights count as zero.
        heights.append(max(0.0, value))

    # The near-zero check runs after the lower clamp but before the upper cap.
    if np.sum(heights) < 0.01:
        raise ValueError("Sum of sequence is too close to zero")
    heights = [min(1000.0, value) for value in heights]

    convolution_2 = np.convolve(heights, heights)
    num_points = len(convolution_2)

    # Evenly spaced grid on [-0.5, 0.5]; the two extra points are the
    # zero-valued endpoints added around the convolution samples.
    x_points = np.linspace(-0.5, 0.5, num_points + 2)
    x_intervals = np.diff(x_points)
    y_points = np.concatenate(([0.0], convolution_2, [0.0]))

    # For a linear segment from y1 to y2 over width h, the integral of its
    # square is h/3 * (y1^2 + y1*y2 + y2^2).  Accumulate left to right so the
    # floating-point result does not depend on a vectorised summation order.
    l2_norm_squared = 0.0
    for h, y1, y2 in zip(x_intervals, y_points[:-1], y_points[1:]):
        l2_norm_squared += (h / 3.0) * (y1**2 + y1 * y2 + y2**2)

    norm_1 = np.sum(np.abs(convolution_2)) / (num_points + 1)
    norm_inf = np.max(np.abs(convolution_2))
    if norm_1 <= 0.0 or norm_inf <= 0.0:
        raise ValueError("Degenerate convolution norms")
    return float(l2_norm_squared / (norm_1 * norm_inf))
def validate_run_output(run_output: Any) -> Tuple[bool, Optional[str]]:
    """Check that a run result is a non-empty list with a finite objective.

    Args:
        run_output: The value returned by the evaluated program's ``run()``.

    Returns:
        ``(True, None)`` when the output is a non-empty list whose
        ``evaluate_sequence`` score is finite; otherwise ``(False, message)``
        where ``message`` describes the problem.  Any exception raised while
        checking is reported through ``message`` rather than propagated.
    """
    try:
        if not isinstance(run_output, list):
            failure = "run() must return list[float]"
        elif not run_output:
            failure = "run() returned empty list"
        elif not np.isfinite(evaluate_sequence(run_output)):
            failure = "Objective is inf/nan"
        else:
            failure = None
    except Exception as err:
        # Validation boundary: turn any scoring error into a failure message.
        failure = str(err)
    return failure is None, failure
def aggregate_alphaevolve_ac2_metrics(results: List[list[float]]) -> Dict[str, Any]:
    """Summarise per-run sequences, ranking by the best objective value.

    Args:
        results: One sequence per run; each is scored with
            ``evaluate_sequence``.

    Returns:
        A dict with ``combined_score`` (the highest score, or ``0.0`` when
        there are no results), ``public`` summary fields, ``private`` per-run
        values and lengths, and a ``text_feedback`` string.
    """
    if not results:
        return {
            "combined_score": 0.0,
            "public": {"best_value": None, "num_runs": 0},
            "private": {"all_values": []},
            "text_feedback": "No successful runs.",
        }

    scores: List[float] = [float(evaluate_sequence(run)) for run in results]
    sizes: List[int] = [len(run) for run in results]

    # Strict ">" keeps the earliest run when several share the top score.
    top_score = -float(np.inf)
    top_run: Optional[list[float]] = None
    for run, score in zip(results, scores):
        if score > top_score:
            top_score, top_run = score, run

    # Maximization task: the best value is used directly as the combined score.
    return {
        "combined_score": top_score,
        "public": {
            "best_value": top_score,
            "best_length": None if top_run is None else len(top_run),
            "num_runs": len(results),
        },
        "private": {
            "all_values": scores,
            "all_lengths": sizes,
        },
        "text_feedback": (
            "Higher evaluate_sequence value is better. "
            "combined_score = best_value."
        ),
    }
def main(program_path: str, results_dir: str, num_experiment_runs: int = 1):
    """Run the evaluation for one program and print a short report.

    Args:
        program_path: Path of the program to evaluate; its ``run`` function
            is the experiment entry point.
        results_dir: Directory passed to ``run_shinka_eval`` for its results.
        num_experiment_runs: Number of runs requested from ``run_shinka_eval``.
    """
    for banner in (
        f"Evaluating program: {program_path}",
        f"Saving results to: {results_dir}",
        f"Number of runs: {num_experiment_runs}",
    ):
        print(banner)

    metrics, correct, error = run_shinka_eval(
        program_path=program_path,
        results_dir=results_dir,
        experiment_fn_name="run",
        num_runs=num_experiment_runs,
        validate_fn=validate_run_output,
        aggregate_metrics_fn=aggregate_alphaevolve_ac2_metrics,
    )

    status = (
        "Evaluation completed successfully."
        if correct
        else f"Evaluation failed: {error}"
    )
    print(status)
    print(f"combined_score={metrics.get('combined_score')}")
if __name__ == "__main__":
    # Command-line entry point: every option is optional and has a default.
    cli = argparse.ArgumentParser(description="Evaluate AlphaEvolve AC2 task")
    for flag, kind, fallback in (
        ("--program_path", str, "tasks/alphaevolve_ac2/initial.py"),
        ("--results_dir", str, "tasks/alphaevolve_ac2/results/debug_eval"),
        ("--num_experiment_runs", int, 1),
    ):
        cli.add_argument(flag, type=kind, default=fallback)
    options = cli.parse_args()
    main(
        program_path=options.program_path,
        results_dir=options.results_dir,
        num_experiment_runs=options.num_experiment_runs,
    )