#!/bin/bash
# Parallel vanilla run via eval service: same pipeline as agent but with
# eval agent trigger interval set to 1000 (effectively never triggers).
# This ensures the evaluation code path is identical to the agent run,
# making the comparison fair.
#
# Usage:
#   ./scripts/dev/run_frontier_cs_parallel_vanilla_server.sh        # all 172 problems, 20 parallel
#   ./scripts/dev/run_frontier_cs_parallel_vanilla_server.sh 50     # first 50 problems, 20 parallel
#   ./scripts/dev/run_frontier_cs_parallel_vanilla_server.sh 50 10  # first 50 problems, 10 parallel

set -euo pipefail
cd "$(dirname "$0")/../.."
PYTHON=".venv/bin/python"

# ============================================================================
# Configuration
# ============================================================================
NUM_PROBLEMS="${1:-all}"
CONCURRENCY="${2:-20}"
GENS=50
SEED_MODEL="gemini3pro"
LLM_MODELS="native-gemini-3-flash-preview"
BASE_PORT=8860              # different port range from agent (8760) to allow concurrent runs
EVAL_TRIGGER_MODE="periodic"
EVAL_TRIGGER_INTERVAL=1000  # effectively never triggers eval agent
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
EXP_NAME="vanilla_g${GENS}"
RUN_DIR="results/frontier_cs_algorithmic/${EXP_NAME}_${TIMESTAMP}"
PROBLEMS_DIR="tasks/Frontier-CS/algorithmic/problems"
SOLUTIONS_DIR="tasks/Frontier-CS/algorithmic/solutions"
LOG_DIR="logs/frontier_cs_parallel_vanilla"
mkdir -p "${LOG_DIR}"

# Fail fast on a non-numeric problem count instead of hitting a confusing
# array-slice error further down.
if [ "${NUM_PROBLEMS}" != "all" ] && ! [[ "${NUM_PROBLEMS}" =~ ^[0-9]+$ ]]; then
  echo "ERROR: first argument must be 'all' or a number, got '${NUM_PROBLEMS}'" >&2
  exit 2
fi

# ============================================================================
# Collect valid problem IDs (sorted numerically, take first N)
# ============================================================================
# Glob instead of parsing `ls` output so entries are never word-split/globbed;
# only keep problems that also have a solutions directory.
ALL_PIDS=()
while IFS= read -r pid; do
  if [ -d "${SOLUTIONS_DIR}/${pid}" ]; then
    ALL_PIDS+=("${pid}")
  fi
done < <(for entry in "${PROBLEMS_DIR}"/*; do printf '%s\n' "${entry##*/}"; done | sort -n)

if [ "${NUM_PROBLEMS}" = "all" ]; then
  PIDS=("${ALL_PIDS[@]}")
else
  PIDS=("${ALL_PIDS[@]:0:${NUM_PROBLEMS}}")
fi

echo "========================================"
echo "Frontier-CS Parallel Vanilla (eval service, no agent trigger)"
echo "========================================"
echo "  Problems:    ${#PIDS[@]} / ${#ALL_PIDS[@]} available"
echo "  Concurrency: ${CONCURRENCY}"
echo "  Generations: ${GENS}"
echo "  Seed model:  ${SEED_MODEL}"
echo "  LLM:         ${LLM_MODELS}"
echo "  Run dir:     ${RUN_DIR}"
echo "  Eval ports:  ${BASE_PORT}-$((BASE_PORT + CONCURRENCY - 1))"
echo "  Trigger:     ${EVAL_TRIGGER_MODE} every ${EVAL_TRIGGER_INTERVAL} gens (disabled)"
echo "  Logs:        ${LOG_DIR}/"
echo "========================================"
echo ""

# ============================================================================
# Track all background PIDs for cleanup
# ============================================================================
ALL_SERVICE_PIDS=()

cleanup() {
  echo ""
  echo "Cleaning up eval services..."
  # ${arr[@]+...} keeps 'set -u' happy when the array is still empty (bash < 4.4).
  for spid in "${ALL_SERVICE_PIDS[@]+"${ALL_SERVICE_PIDS[@]}"}"; do
    kill "${spid}" 2>/dev/null || true
  done
  # Also kill any child processes
  jobs -p | xargs -r kill 2>/dev/null || true
  echo "Done."
}
trap cleanup EXIT INT TERM

# ============================================================================
# Start/stop eval service helpers
# ============================================================================
# start_eval_service PORT
#   Launch one standalone eval service on PORT (reusing a live one if its
#   status endpoint already answers), record its PID for cleanup, and wait
#   up to 20s for readiness. Returns 1 if the service never comes up.
start_eval_service() {
  local port="$1"
  local log_file="${LOG_DIR}/eval_service_port_${port}.log"
  local url="http://localhost:${port}"

  # Check if already running on this port
  if curl -s "${url}/api/v1/status" > /dev/null 2>&1; then
    echo "  Eval service already running on port ${port}"
    return 0
  fi

  ${PYTHON} eval_agent/ev2_service_standalone.py \
    --host "0.0.0.0" --port "${port}" \
    > "${log_file}" 2>&1 &
  local spid=$!
  ALL_SERVICE_PIDS+=("${spid}")

  # Wait for ready (up to 20s)
  local i
  for i in {1..20}; do
    if curl -s "${url}/api/v1/status" > /dev/null 2>&1; then
      echo "  Eval service ready on port ${port} (pid=${spid})"
      return 0
    fi
    sleep 1
  done
  echo "  ERROR: Eval service failed to start on port ${port}"
  return 1
}

# ============================================================================
# Worker function: run one problem via eval service (agent never triggers)
# ============================================================================
# run_problem_vanilla PROBLEM_ID PORT
#   Run one full experiment against the eval service on PORT. Output goes to
#   ${LOG_DIR}/problem_<id>.log; returns the experiment's exit status.
run_problem_vanilla() {
  local pid="$1"
  local port="$2"
  local url="http://localhost:${port}"

  export FRONTIER_CS_PROBLEM_ID="${pid}"
  export FRONTIER_CS_JUDGE_URL="http://localhost:8081"

  # Capture the exit code with '|| status=$?': under 'set -e' a bare failing
  # command would terminate this background shell before 'local status=$?'
  # ran, so the FAIL line below would never be printed.
  local status=0
  ${PYTHON} tasks/frontier_cs_entry/run_experiment.py \
    --experiment-name "${EXP_NAME}" \
    --problem-id "${pid}" \
    --seed-model "${SEED_MODEL}" \
    --num-generations "${GENS}" \
    --max-parallel-jobs 1 \
    --edit-backend single_shot_patch \
    --llm-models ${LLM_MODELS} \
    --run-dir "${RUN_DIR}" \
    --use-eval-service \
    --eval-service-url "${url}" \
    --eval-trigger-mode "${EVAL_TRIGGER_MODE}" \
    --eval-trigger-interval "${EVAL_TRIGGER_INTERVAL}" \
    --use-wandb \
    --wandb-project frontier-cs \
    --wandb-tags frontier_cs vanilla "problem_${pid}" \
    --verbose \
    > "${LOG_DIR}/problem_${pid}.log" 2>&1 || status=$?

  if [ ${status} -eq 0 ]; then
    echo "DONE problem ${pid} (port ${port})"
  else
    echo "FAIL problem ${pid} (port ${port}, see ${LOG_DIR}/problem_${pid}.log)"
  fi
  return ${status}
}

# ============================================================================
# Start eval services (one per concurrency slot)
# ============================================================================
echo "Starting ${CONCURRENCY} eval services..."
for slot in $(seq 0 $((CONCURRENCY - 1))); do
  port=$((BASE_PORT + slot))
  start_eval_service "${port}"
done
echo ""

# ============================================================================
# Run problems in parallel, assigning to available slots
# ============================================================================
# Slot tracking: which slot is free
SLOT_PIDS=()      # bash PID per slot (0 = free)
SLOT_PROBLEMS=()  # problem ID per slot
for slot in $(seq 0 $((CONCURRENCY - 1))); do
  SLOT_PIDS+=(0)
  SLOT_PROBLEMS+=("")
done

DONE=0
FAILED=0
TOTAL=${#PIDS[@]}
IDX=0
ACTIVE=0  # number of busy slots; replaces the former echo|tr|grep busy count

while [ "${IDX}" -lt "${TOTAL}" ] || [ "${ACTIVE}" -gt 0 ]; do
  # Reap finished slots
  for slot in $(seq 0 $((CONCURRENCY - 1))); do
    if [ "${SLOT_PIDS[$slot]}" -ne 0 ] && ! kill -0 "${SLOT_PIDS[$slot]}" 2>/dev/null; then
      # 'wait' must be the 'if' condition: a bare 'wait' followed by
      # '[ $? -eq 0 ]' would abort the whole script under 'set -e' the
      # first time any problem run exited non-zero.
      if wait "${SLOT_PIDS[$slot]}" 2>/dev/null; then
        DONE=$((DONE + 1))
      else
        FAILED=$((FAILED + 1))
      fi
      SLOT_PIDS[$slot]=0
      SLOT_PROBLEMS[$slot]=""
      ACTIVE=$((ACTIVE - 1))
    fi
  done

  # Assign problems to free slots
  for slot in $(seq 0 $((CONCURRENCY - 1))); do
    if [ "${SLOT_PIDS[$slot]}" -eq 0 ] && [ "${IDX}" -lt "${TOTAL}" ]; then
      pid="${PIDS[$IDX]}"
      port=$((BASE_PORT + slot))
      echo "START problem ${pid} on slot ${slot} (port ${port}) [${DONE}+${FAILED}/${TOTAL} complete]"
      run_problem_vanilla "${pid}" "${port}" &
      SLOT_PIDS[$slot]=$!
      SLOT_PROBLEMS[$slot]="${pid}"
      ACTIVE=$((ACTIVE + 1))
      IDX=$((IDX + 1))
    fi
  done

  sleep 2
done

echo ""
echo "========================================"
echo "Parallel vanilla (via eval service) complete"
echo "  Succeeded: ${DONE}"
echo "  Failed:    ${FAILED}"
echo "  Total:     ${TOTAL}"
echo "  Run dir:   ${RUN_DIR}"
echo "  Logs:      ${LOG_DIR}/"
echo "========================================"