shinka-backup / tasks /alphaevolve_ac /evaluate_ori.py
JustinTX's picture
Add files using upload-large-folder tool
40607c3 verified
"""Primary evaluator for the AlphaEvolve AC task."""
import argparse
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
# Add project root to import path for direct file execution.
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from shinka.core import run_shinka_eval
def evaluate_sequence(sequence: list[float]) -> float:
    """Score a coefficient sequence; lower is better.

    Every entry is clamped to the range [0, 1000]. The score is then
    ``2 * n * max(conv(a, a)) / sum(a) ** 2`` where ``n`` is the number of
    entries and ``conv(a, a)`` is the full discrete self-convolution.

    Args:
        sequence: Candidate coefficients. Must be a non-empty ``list`` whose
            entries are finite ``int``/``float`` values (``bool`` is rejected).

    Returns:
        The score as a Python ``float``, or ``inf`` when the input is invalid:
        not a list, empty, containing a bool / non-number / NaN / infinity /
        an int too large to be represented as a float, or when the clamped
        entries sum to less than 0.01.
    """
    invalid = float(np.inf)
    if not isinstance(sequence, list) or not sequence:
        return invalid

    clamped: list[float] = []
    for x in sequence:
        # bool is a subclass of int, so it has to be rejected explicitly.
        if isinstance(x, bool) or not isinstance(x, (int, float)):
            return invalid
        try:
            value = float(x)
        except OverflowError:
            # Arbitrarily large Python ints cannot be converted to float;
            # report them as invalid instead of letting the error escape.
            return invalid
        if not np.isfinite(value):
            return invalid
        clamped.append(min(1000.0, max(0.0, value)))

    n = len(clamped)
    autoconvolution = np.convolve(clamped, clamped)
    max_b = float(np.max(autoconvolution))
    sum_a = float(np.sum(clamped))
    # A (near-)zero sum would make the ratio blow up or be undefined.
    if sum_a < 0.01:
        return invalid
    return float(2.0 * n * max_b / (sum_a**2))
def validate_run_output(run_output: Any) -> Tuple[bool, Optional[str]]:
    """Check that the value returned by ``run()`` can be scored.

    Returns ``(True, None)`` when ``run_output`` is a non-empty list whose
    ``evaluate_sequence`` score is finite; otherwise ``(False, reason)``.
    Any exception raised during the check is reported as the reason string.
    """
    try:
        problem: Optional[str] = None
        if not isinstance(run_output, list):
            problem = "run() must return list[float]"
        elif len(run_output) == 0:
            problem = "run() returned empty list"
        elif not np.isfinite(evaluate_sequence(run_output)):
            problem = "evaluate_sequence returned inf/nan"
    except Exception as err:
        # Validation must never raise; surface the failure as a message.
        return False, str(err)
    return problem is None, problem
def aggregate_alphaevolve_ac_metrics(results: List[list[float]]) -> Dict[str, Any]:
    """Summarise the scores of all runs, ranking by the single best run.

    Each sequence in ``results`` is scored with ``evaluate_sequence``. The
    lowest score is reported as ``best_value`` and negated to produce
    ``combined_score``. With no results, a fixed "no successful runs"
    payload is returned instead.
    """
    if not results:
        return {
            "combined_score": 0.0,
            "public": {"best_value": None, "num_runs": 0},
            "private": {"all_values": []},
            "text_feedback": "No successful runs.",
        }

    all_values: List[float] = []
    all_lengths: List[int] = []
    winner: Optional[list[float]] = None
    lowest = float(np.inf)

    for candidate in results:
        score = float(evaluate_sequence(candidate))
        all_values.append(score)
        all_lengths.append(len(candidate))
        # Strict comparison: the earliest run wins ties, and a run scored
        # as inf never becomes the winner.
        if score < lowest:
            lowest, winner = score, candidate

    return {
        "combined_score": -lowest,
        "public": {
            "best_value": lowest,
            "best_length": None if winner is None else len(winner),
            "num_runs": len(results),
        },
        "private": {
            "all_values": all_values,
            "all_lengths": all_lengths,
        },
        "text_feedback": (
            "Lower evaluate_sequence value is better. "
            "combined_score = -best_value."
        ),
    }
def main(program_path: str, results_dir: str, num_experiment_runs: int = 1):
    """Evaluate a candidate program and print a short summary.

    Delegates to ``run_shinka_eval``, which calls the program's ``run``
    function ``num_experiment_runs`` times with this module's validation and
    aggregation callbacks. Writing metrics.json/correct.json into
    ``results_dir`` is presumably handled by ``run_shinka_eval`` (not visible
    here).
    """
    print(f"Evaluating program: {program_path}")
    print(f"Saving results to: {results_dir}")
    print(f"Number of runs: {num_experiment_runs}")

    eval_kwargs = {
        "program_path": program_path,
        "results_dir": results_dir,
        "experiment_fn_name": "run",
        "num_runs": num_experiment_runs,
        "validate_fn": validate_run_output,
        "aggregate_metrics_fn": aggregate_alphaevolve_ac_metrics,
    }
    metrics, correct, error = run_shinka_eval(**eval_kwargs)

    outcome = (
        "Evaluation completed successfully."
        if correct
        else f"Evaluation failed: {error}"
    )
    print(outcome)
    print(f"combined_score={metrics.get('combined_score')}")
if __name__ == "__main__":
    # Command-line entry point: every option has a default so the script can
    # be run without arguments for a quick debug evaluation.
    cli = argparse.ArgumentParser(description="Evaluate AlphaEvolve AC task")
    cli.add_argument(
        "--program_path", type=str, default="tasks/alphaevolve_ac/initial.py"
    )
    cli.add_argument(
        "--results_dir", type=str, default="tasks/alphaevolve_ac/results/debug_eval"
    )
    cli.add_argument("--num_experiment_runs", type=int, default=1)
    options = cli.parse_args()
    main(options.program_path, options.results_dir, options.num_experiment_runs)