shinka-backup / scripts /dev /run_frontier_cs_batch.sh
JustinTX's picture
Add files using upload-large-folder tool
3f6526a verified
#!/bin/bash
# Batch run: Frontier-CS algorithmic problems with eval service.
#
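# Prerequisites: the eval service must be reachable on the configured port.
# If it is not already running, this script starts it in the background and
# stops it again on exit. To start it manually instead:
# OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \
# .venv/bin/python eval_agent/ev2_service_standalone.py --host "0.0.0.0" --port 8755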
# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# All relative paths below are resolved from two directories above the
# directory that contains this script.
cd "$(dirname "$0")/../.."

# ----------------------------------------------------------------------------
# Optional positional arguments: inclusive problem-ID range
#   $1  first problem ID (default 0)
#   $2  last problem ID  (default 49)
# ----------------------------------------------------------------------------
PID_START=${1:-0}
PID_END=${2:-49}

# ----------------------------------------------------------------------------
# Run settings
# ----------------------------------------------------------------------------
PYTHON='.venv/bin/python'     # interpreter used for every Python entry point
SEED_MODEL='gemini3pro'
GENS=50                       # passed as --num-generations
PARALLEL=4                    # passed as --max-parallel-jobs

# ----------------------------------------------------------------------------
# Eval service settings
# ----------------------------------------------------------------------------
EVAL_PORT=8755
EVAL_URL="http://localhost:$EVAL_PORT"
EVAL_TRIGGER_MODE='periodic'
# Chosen so large that the agent is effectively never triggered.
EVAL_TRIGGER_INTERVAL=1000
# ============================================================================
# Start eval service in the background (if not already running)
# ============================================================================

# Succeeds when the status endpoint under EVAL_URL answers a request.
eval_service_is_up() {
  curl -s "${EVAL_URL}/api/v1/status" > /dev/null 2>&1
}

# EXIT-trap handler: stops the eval service that this script launched.
# Only registered when the script started the service itself, so a service
# that was already running is left alone.
stop_eval_service() {
  echo 'Stopping eval service...'
  kill "${EVAL_PID}" 2>/dev/null || true
}

if eval_service_is_up; then
  echo "Eval service already running at ${EVAL_URL}"
else
  echo "Starting eval service on port ${EVAL_PORT}..."
  OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \
    "${PYTHON}" eval_agent/ev2_service_standalone.py \
    --host "0.0.0.0" --port "${EVAL_PORT}" &
  EVAL_PID=$!

  # Register cleanup right away: if the script is interrupted or fails while
  # still waiting for readiness, the background service must not be orphaned.
  trap stop_eval_service EXIT

  # Poll once per second, up to 30 times, for the service to come up.
  eval_ready=0
  for ((eval_attempt = 1; eval_attempt <= 30; eval_attempt++)); do
    if eval_service_is_up; then
      eval_ready=1
      echo "Eval service ready (pid=${EVAL_PID})"
      break
    fi
    # Fail fast when the service process has already exited instead of
    # polling a dead process for the remaining attempts.
    if ! kill -0 "${EVAL_PID}" 2>/dev/null; then
      break
    fi
    sleep 1
  done

  # One last probe covers a service that became ready during the final sleep.
  if (( ! eval_ready )) && ! eval_service_is_up; then
    echo "ERROR: Eval service failed to start" >&2
    exit 1  # the EXIT trap stops whatever is left of the service
  fi
fi
# ============================================================================
# Collect valid problem IDs in range
# ============================================================================
PROBLEMS_DIR="tasks/Frontier-CS/algorithmic/problems"
SOLUTIONS_DIR="tasks/Frontier-CS/algorithmic/solutions"

#######################################
# Fill PIDS with the numerically named entries of the problems directory
# that lie inside an inclusive range and have a matching solutions directory.
# Entries are visited in ascending numeric order.
# Globals:
#   PIDS (written; reset to an empty array first)
# Arguments:
#   $1 - problems directory
#   $2 - solutions directory
#   $3 - first problem ID (inclusive)
#   $4 - last problem ID (inclusive)
# Outputs:
#   stdout: one "SKIP problem ..." line per in-range problem without solutions
#   stderr: a warning when the directory is missing or a bound is not an
#           integer (PIDS is then left empty)
# Returns:
#   0 always; an unusable input yields an empty PIDS rather than an abort.
#######################################
collect_problem_ids() {
  local problems_dir=$1 solutions_dir=$2 first=$3 last=$4
  local entry name
  local -a candidates=()
  PIDS=()

  if [[ ! -d "${problems_dir}" ]]; then
    echo "WARNING: problems directory not found: ${problems_dir}" >&2
    return 0
  fi

  # `[ x -eq x ]` succeeds only for integers; it is the same parser the range
  # comparison below relies on.
  if ! [ "${first}" -eq "${first}" ] 2>/dev/null \
    || ! [ "${last}" -eq "${last}" ] 2>/dev/null; then
    echo "WARNING: problem range bounds must be integers (got '${first}'..'${last}'); no problems selected" >&2
    return 0
  fi

  # Glob instead of parsing `ls`; keep only all-digit names. An unmatched
  # glob stays literal ("*") and is dropped by the same filter.
  for entry in "${problems_dir}"/*; do
    name=${entry##*/}
    [[ "${name}" =~ ^[0-9]+$ ]] || continue
    candidates+=("${name}")
  done
  (( ${#candidates[@]} > 0 )) || return 0

  while IFS= read -r name; do
    # `[ ... ]` compares in base 10, so IDs with leading zeros are not read
    # as octal; names too large to parse are silently treated as out of range.
    if [ "${name}" -ge "${first}" ] 2>/dev/null \
      && [ "${name}" -le "${last}" ] 2>/dev/null; then
      # Ensure solution exists for this problem
      if [[ -d "${solutions_dir}/${name}" ]]; then
        PIDS+=("${name}")
      else
        echo "SKIP problem ${name}: no solutions available"
      fi
    fi
  done < <(printf '%s\n' "${candidates[@]}" | sort -n)
}

PIDS=()
collect_problem_ids "${PROBLEMS_DIR}" "${SOLUTIONS_DIR}" "${PID_START}" "${PID_END}"
#######################################
# Print the run summary banner, followed by one blank line.
# Globals (read):
#   PIDS, GENS, PARALLEL, SEED_MODEL, EVAL_TRIGGER_INTERVAL
# Outputs:
#   The banner on stdout.
#######################################
print_batch_banner() {
  local rule='========================================'
  # ${PIDS[*]-} rather than ${PIDS[*]}: under `set -u`, bash releases before
  # 4.4 treat the expansion of an empty array as an unbound variable.
  printf '%s\n' \
    "${rule}" \
    "Frontier-CS Batch Run" \
    "${rule}" \
    " Problems: ${PIDS[*]-}" \
    " Total: ${#PIDS[@]}" \
    " Generations: ${GENS}" \
    " Parallel: ${PARALLEL}" \
    " Seed model: ${SEED_MODEL}" \
    " Eval agent: disabled (interval=${EVAL_TRIGGER_INTERVAL})" \
    "${rule}" \
    ""
}

print_batch_banner
#######################################
# Run the experiment entry point for a single problem.
# Globals (read):
#   PYTHON, GENS, SEED_MODEL, PARALLEL, EVAL_URL, EVAL_TRIGGER_MODE,
#   EVAL_TRIGGER_INTERVAL
# Arguments:
#   $1 - problem ID
# Returns:
#   The exit status of the experiment runner.
#######################################
run_problem() {
  local problem_id=$1
  # Build the command as an array so every argument stays one word.
  local -a cmd=(
    "${PYTHON}" tasks/frontier_cs_entry/run_experiment.py
    --experiment-name "batch_g${GENS}"
    --problem-id "${problem_id}"
    --seed-model "${SEED_MODEL}"
    --num-generations "${GENS}"
    --max-parallel-jobs "${PARALLEL}"
    --use-eval-service
    --eval-service-url "${EVAL_URL}"
    --eval-trigger-mode "${EVAL_TRIGGER_MODE}"
    --eval-trigger-interval "${EVAL_TRIGGER_INTERVAL}"
    --verbose
  )
  "${cmd[@]}"
}

#######################################
# Run every problem in PIDS in order and print a final summary.
# A failing problem is counted and reported; the batch continues.
# Globals:
#   PIDS (read), DONE (written: successes), FAILED (written: failures)
# Outputs:
#   Progress and summary lines on stdout.
# Returns:
#   0 always.
# NOTE(review): because the summary is the last command, the script exits 0
# even when problems failed; callers needing a failure signal must read the
# summary line. Left unchanged to keep the exit-status contract.
#######################################
run_batch() {
  local total=${#PIDS[@]}
  local position=0
  local problem_id
  DONE=0
  FAILED=0

  # Guarded because expanding an empty array aborts under `set -u` on bash
  # releases before 4.4.
  if (( total > 0 )); then
    for problem_id in "${PIDS[@]}"; do
      # Progress shows the 1-based position in the batch, so it keeps
      # advancing after a failure (the success count alone would stall).
      position=$((position + 1))
      echo "----------------------------------------"
      echo "[${position}/${total}] Problem ${problem_id}"
      echo "----------------------------------------"
      if run_problem "${problem_id}"; then
        DONE=$((DONE + 1))
        echo "OK problem ${problem_id}"
      else
        FAILED=$((FAILED + 1))
        echo "FAILED problem ${problem_id}"
      fi
      echo ""
    done
  fi

  echo "========================================"
  echo "Batch complete: ${DONE} succeeded, ${FAILED} failed out of ${total}"
  echo "========================================"
}

run_batch