#!/bin/bash
# Batch run: Frontier-CS algorithmic problems with eval service.
#
# Usage: <this script> [PID_START] [PID_END]
#   PID_START  first problem ID to run, inclusive (default: 0)
#   PID_END    last problem ID to run, inclusive (default: 49)
#
# The eval service is reused if it already answers at EVAL_URL; otherwise this
# script starts it in the background and stops it again when the script exits.
# To run the service by hand instead:
#   OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \
#     .venv/bin/python eval_agent/ev2_service_standalone.py --host "0.0.0.0" --port 8755

set -euo pipefail

# All relative paths below are resolved from two directories above this script.
cd "$(dirname "$0")/../.."

PYTHON=".venv/bin/python"

# ============================================================================
# Configuration
# ============================================================================
GENS=50
PARALLEL=4
SEED_MODEL="gemini3pro"
EVAL_PORT=8755
EVAL_URL="http://localhost:${EVAL_PORT}"
EVAL_TRIGGER_MODE="periodic"
EVAL_TRIGGER_INTERVAL=1000  # effectively never triggers agent

PROBLEMS_DIR="tasks/Frontier-CS/algorithmic/problems"
SOLUTIONS_DIR="tasks/Frontier-CS/algorithmic/solutions"

# Problem range (inclusive)
PID_START="${1:-0}"
PID_END="${2:-49}"

# Reject non-integer bounds up front; otherwise every range test below would
# fail and the batch would silently run zero problems.
if ! [[ "${PID_START}" =~ ^-?[0-9]+$ && "${PID_END}" =~ ^-?[0-9]+$ ]]; then
  printf 'Usage: %s [PID_START] [PID_END]  (integers, inclusive)\n' "${0##*/}" >&2
  exit 2
fi

# Fail fast, before any service is started, if there is nothing to scan.
if [[ ! -d "${PROBLEMS_DIR}" ]]; then
  printf 'ERROR: problems directory not found: %s\n' "${PROBLEMS_DIR}" >&2
  exit 1
fi

# Succeeds when the eval service answers its status endpoint.
eval_service_up() {
  curl -s "${EVAL_URL}/api/v1/status" > /dev/null 2>&1
}

# EXIT handler: stops the eval service this script started (best effort; the
# process may already be gone, hence the ignored kill failure).
stop_eval_service() {
  echo 'Stopping eval service...'
  kill "${EVAL_PID}" 2>/dev/null || true
}

# ============================================================================
# Start eval service in the background (if not already running)
# ============================================================================
if eval_service_up; then
  echo "Eval service already running at ${EVAL_URL}"
else
  echo "Starting eval service on port ${EVAL_PORT}..."
  OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \
    "${PYTHON}" eval_agent/ev2_service_standalone.py \
    --host "0.0.0.0" --port "${EVAL_PORT}" &
  EVAL_PID=$!
  # Install the cleanup immediately so that an interrupt or failure during the
  # readiness wait cannot leave the background service running.
  trap stop_eval_service EXIT

  # Poll once per second, up to 30 times, then probe one last time after the
  # final sleep.
  ready=0
  for ((attempt = 0; attempt < 30; attempt++)); do
    if eval_service_up; then
      ready=1
      break
    fi
    sleep 1
  done
  if ((ready == 0)) && eval_service_up; then
    ready=1
  fi

  if ((ready == 0)); then
    echo "ERROR: Eval service failed to start" >&2
    exit 1  # the EXIT trap kills the half-started service
  fi
  echo "Eval service ready (pid=${EVAL_PID})"
fi

# ============================================================================
# Collect valid problem IDs in range
# ============================================================================
PIDS=()
while IFS= read -r pid; do
  # `[ ... ]` compares in base 10, so IDs with leading zeros (e.g. 08) are
  # fine; `[[ ]]` / `(( ))` would try to read them as octal.
  if [ "${pid}" -ge "${PID_START}" ] && [ "${pid}" -le "${PID_END}" ]; then
    # Ensure solution exists for this problem
    if [ -d "${SOLUTIONS_DIR}/${pid}" ]; then
      PIDS+=("${pid}")
    else
      echo "SKIP problem ${pid}: no solutions available"
    fi
  fi
done < <(
  # Emit the purely numeric entry names of PROBLEMS_DIR in numeric order.
  # An empty directory leaves the literal pattern, which the digit filter drops.
  for entry in "${PROBLEMS_DIR}"/*; do
    name="${entry##*/}"
    if [[ "${name}" =~ ^[0-9]+$ ]]; then
      printf '%s\n' "${name}"
    fi
  done | sort -n
)

echo "========================================"
echo "Frontier-CS Batch Run"
echo "========================================"
# `${PIDS[*]-}`: an empty array counts as unset under `set -u` before bash 4.4.
echo "  Problems:    ${PIDS[*]-}"
echo "  Total:       ${#PIDS[@]}"
echo "  Generations: ${GENS}"
echo "  Parallel:    ${PARALLEL}"
echo "  Seed model:  ${SEED_MODEL}"
echo "  Eval agent:  disabled (interval=${EVAL_TRIGGER_INTERVAL})"
echo "========================================"
echo ""

DONE=0
FAILED=0

# The `${arr[@]+...}` form expands to nothing for an empty array instead of
# tripping `set -u` on bash older than 4.4.
for PID in ${PIDS[@]+"${PIDS[@]}"}; do
  echo "----------------------------------------"
  # Progress = problems already processed (succeeded + failed), so the counter
  # keeps advancing after a failure.
  echo "[$((DONE + FAILED))/${#PIDS[@]}] Problem ${PID}"
  echo "----------------------------------------"

  # Running inside `if` keeps a failing experiment from aborting the whole
  # batch under `set -e`.
  if "${PYTHON}" tasks/frontier_cs_entry/run_experiment.py \
    --experiment-name "batch_g${GENS}" \
    --problem-id "${PID}" \
    --seed-model "${SEED_MODEL}" \
    --num-generations "${GENS}" \
    --max-parallel-jobs "${PARALLEL}" \
    --use-eval-service \
    --eval-service-url "${EVAL_URL}" \
    --eval-trigger-mode "${EVAL_TRIGGER_MODE}" \
    --eval-trigger-interval "${EVAL_TRIGGER_INTERVAL}" \
    --verbose; then
    DONE=$((DONE + 1))
    echo "OK problem ${PID}"
  else
    FAILED=$((FAILED + 1))
    echo "FAILED problem ${PID}"
  fi
  echo ""
done

# NOTE: the exit status does not reflect FAILED; consult the summary line.
echo "========================================"
echo "Batch complete: ${DONE} succeeded, ${FAILED} failed out of ${#PIDS[@]}"
echo "========================================"