sky2 / scripts /reproduce /math.sh
JustinTX's picture
Add files using upload-large-folder tool
af83196 verified
#!/usr/bin/env bash
# Reproduce math benchmarks (17 problems x 2 search methods).
# All benchmarks launch in parallel.
#
# Usage:
#   bash math.sh
#   MODEL="gemini/gemini-3.0-pro-preview" ITERATIONS=20 bash math.sh
set -euo pipefail
# ── Settings ─────────────────────────────────────────────────────────────────
# Only two things to change. Either edit the defaults below, or override them
# from the environment without touching this file (see Usage above).
MODEL="${MODEL:-gpt-5}" # main generation model
# MODEL="gemini/gemini-3.0-pro-preview" # alternative
ITERATIONS="${ITERATIONS:-100}"
# -m sets all models (main + guide/paradigm) to the same MODEL.
# API keys: export OPENAI_API_KEY="sk-..." (and/or GEMINI_API_KEY for Gemini)
# ── Install ──────────────────────────────────────────────────────────────────
# Work from two directories above this script: every benchmarks/... and
# outputs/... path used below is relative to that directory.
# `--` keeps a script path that starts with "-" from being parsed as an option.
cd -- "$(dirname -- "$0")/../.."
uv sync --extra math
# ── Helper ───────────────────────────────────────────────────────────────────
run() {
  # Launch one skydiscover-run job for a single benchmark.
  #   $1  benchmark directory, e.g. benchmarks/math/circle_packing
  #   $2  search method; passed to -s and used to look up config_<method>.yaml
  # Reads the globals MODEL and ITERATIONS.
  # Returns the exit status of the `uv run` command.
  local bench_dir=$1 method=$2
  # Benchmark path without the leading "benchmarks/"; used for the banner and
  # for the output directory.
  local label=${bench_dir#benchmarks/}

  # Seed file: initial_program.py unless an alternative exists. Candidates are
  # checked in order and a later match replaces an earlier one.
  local seed="$bench_dir/initial_program.py"
  local alt
  for alt in initial_program.cpp initial_prompt.txt; do
    if [[ -f "$bench_dir/$alt" ]]; then
      seed="$bench_dir/$alt"
    fi
  done

  # A method-specific config, when present, replaces the shared config.yaml.
  local config="$bench_dir/config.yaml"
  local method_config="$bench_dir/config_${method}.yaml"
  if [[ -f "$method_config" ]]; then
    config=$method_config
  fi

  echo "== $method: $label =="
  uv run skydiscover-run "$seed" "$bench_dir/evaluator.py" \
    -c "$config" -s "$method" -m "$MODEL" -i "$ITERATIONS" \
    -o "outputs/reproduce/$method/$label"
}
# ── Launch ───────────────────────────────────────────────────────────────────
# Every benchmark is run once per search method, all in the background.
# Paths are relative to benchmarks/math/.
benchmarks=(
  circle_packing
  circle_packing_rect
  erdos_min_overlap
  first_autocorr_ineq
  second_autocorr_ineq
  third_autocorr_ineq
  uncertainty_ineq
  hexagon_packing/11
  hexagon_packing/12
  heilbronn_convex/13
  heilbronn_convex/14
  heilbronn_triangle
  minimizing_max_min_dist/2
  minimizing_max_min_dist/3
  matmul
  signal_processing
  sums_diffs_finite_sets
)
searches=(adaevolve evox)

# pids[i] is the background job for labels[i]; both are filled in launch order
# (all AdaEvolve runs first, then all EvoX runs).
pids=()
labels=()
for search in "${searches[@]}"; do
  for bench in "${benchmarks[@]}"; do
    run "benchmarks/math/$bench" "$search" &
    pids+=("$!")
    labels+=("$search: math/$bench")
  done
done

# A bare `wait` returns 0 whatever the jobs did, and `set -e` does not apply to
# background jobs. Wait on each PID so that a failed run is reported and makes
# the script exit non-zero.
failed=0
for i in "${!pids[@]}"; do
  if ! wait "${pids[$i]}"; then
    echo "math.sh: FAILED ${labels[$i]}" >&2
    failed=$((failed + 1))
  fi
done

if ((failed > 0)); then
  echo "math.sh: $failed of ${#pids[@]} runs failed." >&2
  exit 1
fi
echo "math.sh: all ${#pids[@]} runs finished."