#!/bin/bash
#
# Frontier-CS batch run.
#
# Runs tasks/frontier_cs_entry/run_experiment.py once for every problem id in
# an inclusive range, pointing every run at the same eval service URL.
#
# Usage: <this script> [PID_START] [PID_END]      (defaults: 0 and 49)

set -euo pipefail

# Work from two levels above the directory holding this script so that the
# relative paths used below resolve (presumably the repository root).
cd -- "$(dirname -- "$0")/../.."

# Interpreter used for both the eval service and the experiment runs.
PYTHON=".venv/bin/python"

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
GENS=50                      # forwarded as --num-generations
PARALLEL=4                   # forwarded as --max-parallel-jobs
SEED_MODEL="gemini3pro"      # forwarded as --seed-model
EVAL_PORT=8755               # port the eval service is probed/started on
EVAL_URL="http://localhost:${EVAL_PORT}"
EVAL_TRIGGER_MODE="periodic"    # forwarded as --eval-trigger-mode
EVAL_TRIGGER_INTERVAL=1000      # forwarded as --eval-trigger-interval

# Succeeds when $1 is an optionally signed decimal integer.
is_integer() {
  local int_re='^-?[0-9]+$'
  [[ "${1-}" =~ ${int_re} ]]
}

# Inclusive range of problem ids to run, taken from the command line.
PID_START="${1:-0}"
PID_END="${2:-49}"

# Reject non-integer bounds up front: the numeric range tests applied to the
# problem ids later would otherwise fail quietly and select nothing.
if ! is_integer "${PID_START}" || ! is_integer "${PID_END}"; then
  echo "ERROR: PID_START and PID_END must be integers (got '${PID_START}' and '${PID_END}')" >&2
  echo "Usage: $0 [PID_START] [PID_END]" >&2
  exit 2
fi
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Eval service: reuse one that already answers at EVAL_URL; otherwise start
# one in the background and stop it again when this script exits.
# ---------------------------------------------------------------------------

# Succeeds when the status endpoint answers; curl's output is discarded.
eval_service_up() {
  curl -s "${EVAL_URL}/api/v1/status" > /dev/null 2>&1
}

# EXIT handler: stops the service started below. A failing kill (for example
# because the process is already gone) is deliberately ignored.
stop_eval_service() {
  echo 'Stopping eval service...'
  kill "${EVAL_PID}" 2>/dev/null || true
}

if eval_service_up; then
  echo "Eval service already running at ${EVAL_URL}"
else
  echo "Starting eval service on port ${EVAL_PORT}..."
  OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \
    "${PYTHON}" eval_agent/ev2_service_standalone.py \
    --host "0.0.0.0" --port "${EVAL_PORT}" &
  EVAL_PID=$!

  # Register cleanup immediately, so that leaving the script while still
  # waiting for readiness (failure below, or an interrupt) does not leave the
  # background service running.
  trap stop_eval_service EXIT

  # Poll once per second, for at most 30 attempts.
  eval_ready=0
  for ((attempt = 1; attempt <= 30; attempt++)); do
    if eval_service_up; then
      eval_ready=1
      echo "Eval service ready (pid=${EVAL_PID})"
      break
    fi
    # Stop waiting as soon as the service process itself has exited.
    if ! kill -0 "${EVAL_PID}" 2>/dev/null; then
      break
    fi
    sleep 1
  done

  # One last probe covers a service that came up during the final sleep.
  if ((eval_ready == 0)) && ! eval_service_up; then
    echo "ERROR: Eval service failed to start" >&2
    exit 1
  fi
fi
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Problem discovery
# ---------------------------------------------------------------------------
PROBLEMS_DIR="tasks/Frontier-CS/algorithmic/problems"
SOLUTIONS_DIR="tasks/Frontier-CS/algorithmic/solutions"
PIDS=()

#######################################
# Fills the global PIDS array with the numerically named entries of a
# problems directory that lie in an inclusive range and have a matching
# solutions directory, in ascending numeric order.
# Globals:   PIDS (written)
# Arguments: $1 - problems directory
#            $2 - solutions directory
#            $3 - first problem id (inclusive)
#            $4 - last problem id (inclusive)
# Outputs:   a "SKIP problem N: ..." line on stdout for each in-range problem
#            without a solutions directory; warnings on stderr
# Returns:   0 always; a missing directory or a non-integer range leaves
#            PIDS empty
#######################################
select_problems() {
  local problems_dir=$1 solutions_dir=$2 first=$3 last=$4
  local int_re='^-?[0-9]+$' id_re='^[0-9]+$'
  local entry name
  local -a numeric_names=()
  PIDS=()

  if [[ ! -d "${problems_dir}" ]]; then
    echo "WARNING: problems directory not found: ${problems_dir}" >&2
    return 0
  fi
  if ! [[ "${first}" =~ ${int_re} && "${last}" =~ ${int_re} ]]; then
    echo "WARNING: non-integer problem range '${first}'..'${last}'; no problems selected" >&2
    return 0
  fi

  # Glob instead of parsing `ls`. The numeric filter also discards the
  # literal pattern that an unmatched glob leaves behind.
  for entry in "${problems_dir}"/*; do
    name=${entry##*/}
    [[ "${name}" =~ ${id_re} ]] || continue
    numeric_names+=("${name}")
  done
  ((${#numeric_names[@]} > 0)) || return 0

  while IFS= read -r name; do
    # `[ ]` compares as plain decimal, so ids with leading zeros are not
    # misread as octal the way `[[ ]]` / `(( ))` arithmetic would.
    if [ "${name}" -ge "${first}" ] && [ "${name}" -le "${last}" ]; then
      if [[ -d "${solutions_dir}/${name}" ]]; then
        PIDS+=("${name}")
      else
        echo "SKIP problem ${name}: no solutions available"
      fi
    fi
  done < <(printf '%s\n' "${numeric_names[@]}" | sort -n)
  return 0
}

select_problems "${PROBLEMS_DIR}" "${SOLUTIONS_DIR}" "${PID_START}" "${PID_END}"
|
|
# Prints the run summary banner (selected problems and run settings) to
# stdout, followed by one blank line.
# Globals: PIDS, GENS, PARALLEL, SEED_MODEL, EVAL_TRIGGER_INTERVAL (read)
print_banner() {
  local rule="========================================"

  echo "${rule}"
  echo "Frontier-CS Batch Run"
  echo "${rule}"
  # The "-" default keeps an empty PIDS array from tripping `set -u` on
  # bash releases older than 4.4.
  echo " Problems: ${PIDS[*]-}"
  echo " Total: ${#PIDS[@]}"
  echo " Generations: ${GENS}"
  echo " Parallel: ${PARALLEL}"
  echo " Seed model: ${SEED_MODEL}"
  echo " Eval agent: disabled (interval=${EVAL_TRIGGER_INTERVAL})"
  echo "${rule}"
  echo ""
}

print_banner
|
|
# Outcome counters for the batch.
DONE=0
FAILED=0

#######################################
# Runs the experiment entry point once per selected problem, one after the
# other, then prints a summary. A failing run is counted and the batch moves
# on to the next problem.
# Globals:   PIDS, PYTHON, GENS, PARALLEL, SEED_MODEL, EVAL_URL,
#            EVAL_TRIGGER_MODE, EVAL_TRIGGER_INTERVAL (read)
#            DONE, FAILED (written)
# Outputs:   progress lines and the final summary on stdout
#######################################
run_batch() {
  local problem
  DONE=0
  FAILED=0

  # ${PIDS[@]+...} expands to nothing for an empty array; a bare
  # "${PIDS[@]}" would trip `set -u` on bash releases older than 4.4.
  for problem in ${PIDS[@]+"${PIDS[@]}"}; do
    echo "----------------------------------------"
    # Progress counts every finished problem, failed ones included, so the
    # label keeps advancing after a failure.
    echo "[$((DONE + FAILED))/${#PIDS[@]}] Problem ${problem}"
    echo "----------------------------------------"

    # Running inside `if` keeps a failing experiment from aborting the whole
    # batch under `set -e`.
    if "${PYTHON}" tasks/frontier_cs_entry/run_experiment.py \
      --experiment-name "batch_g${GENS}" \
      --problem-id "${problem}" \
      --seed-model "${SEED_MODEL}" \
      --num-generations "${GENS}" \
      --max-parallel-jobs "${PARALLEL}" \
      --use-eval-service \
      --eval-service-url "${EVAL_URL}" \
      --eval-trigger-mode "${EVAL_TRIGGER_MODE}" \
      --eval-trigger-interval "${EVAL_TRIGGER_INTERVAL}" \
      --verbose; then
      DONE=$((DONE + 1))
      echo "OK problem ${problem}"
    else
      FAILED=$((FAILED + 1))
      echo "FAILED problem ${problem}"
    fi
    echo ""
  done

  echo "========================================"
  echo "Batch complete: ${DONE} succeeded, ${FAILED} failed out of ${#PIDS[@]}"
  echo "========================================"
}

run_batch
|
|