shinka-backup / scripts /dev /run_frontier_cs_batch.sh

Add files using upload-large-folder tool

3f6526a verified 18 days ago

3.98 kB

	#!/bin/bash
	# Batch run: Frontier-CS algorithmic problems with eval service.
	#
	# Usage: run_frontier_cs_batch.sh [PID_START [PID_END]]
	#   PID_START  first problem ID to run (inclusive, default 0)
	#   PID_END    last problem ID to run (inclusive, default 49)
	#
	# Eval service: the script reuses a service that already answers on EVAL_URL
	# and otherwise launches one in the background. To start it by hand instead:
	#   OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \
	#   .venv/bin/python eval_agent/ev2_service_standalone.py --host "0.0.0.0" --port 8755

	set -euo pipefail

	# Work from the directory two levels above this script; every relative path
	# used below (.venv, eval_agent, tasks) is resolved from there.
	cd "$(dirname "$0")/../.."

	PYTHON=".venv/bin/python"

	# ============================================================================
	# Configuration
	# ============================================================================
	GENS=50
	PARALLEL=4
	SEED_MODEL="gemini3pro"
	EVAL_PORT=8755
	EVAL_URL="http://localhost:${EVAL_PORT}"
	EVAL_TRIGGER_MODE="periodic"
	EVAL_TRIGGER_INTERVAL=1000 # effectively never triggers agent

	# Succeeds when $1 is a decimal integer (an optional leading '-' is allowed).
	is_integer() {
	  [[ "$1" =~ ^-?[0-9]+$ ]]
	}

	# Problem range (inclusive)
	PID_START="${1:-0}"
	PID_END="${2:-49}"

	# Reject non-integer bounds up front: the range filter further down discards
	# comparison errors, so a typo here would otherwise run zero problems silently.
	for bound in "${PID_START}" "${PID_END}"; do
	  if ! is_integer "${bound}"; then
	    echo "ERROR: problem range bounds must be integers, got '${bound}'" >&2
	    echo "Usage: ${0##*/} [PID_START [PID_END]]" >&2
	    exit 2
	  fi
	done
	unset bound

	# ============================================================================
	# Start eval service in the background (if not already running)
	# ============================================================================

	# Succeeds when the eval service answers its status endpoint at EVAL_URL.
	eval_service_up() {
	  curl -s "${EVAL_URL}/api/v1/status" > /dev/null 2>&1
	}

	if eval_service_up; then
	  # Reuse the running service; it was not started here, so it is not stopped
	  # on exit either.
	  echo "Eval service already running at ${EVAL_URL}"
	else
	  echo "Starting eval service on port ${EVAL_PORT}..."
	  OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \
	    "${PYTHON}" eval_agent/ev2_service_standalone.py \
	    --host "0.0.0.0" --port "${EVAL_PORT}" &
	  EVAL_PID=$!

	  # Register cleanup before waiting, so an interrupted or failed startup still
	  # stops the background service. Single quotes defer expansion to exit time.
	  trap 'echo "Stopping eval service..."; kill "${EVAL_PID}" 2>/dev/null || true' EXIT

	  # Poll once per second, for at most 30 attempts.
	  for ((attempt = 1; attempt <= 30; attempt++)); do
	    if eval_service_up; then
	      echo "Eval service ready (pid=${EVAL_PID})"
	      break
	    fi
	    # The service process is gone: further polling cannot succeed.
	    if ! kill -0 "${EVAL_PID}" 2>/dev/null; then
	      break
	    fi
	    sleep 1
	  done

	  # Final check also covers the case where the loop ran out of attempts.
	  if ! eval_service_up; then
	    echo "ERROR: Eval service failed to start" >&2
	    exit 1 # the EXIT trap stops the service process if it is still alive
	  fi
	fi

	# ============================================================================
	# Collect valid problem IDs in range
	# ============================================================================
	PROBLEMS_DIR="tasks/Frontier-CS/algorithmic/problems"
	SOLUTIONS_DIR="tasks/Frontier-CS/algorithmic/solutions"
	PIDS=()

	#######################################
	# Fill PIDS with the problem IDs to run, in ascending numeric order.
	# An entry of the problems directory is selected when its name is an integer
	# within [first, last] and a directory of the same name exists under the
	# solutions directory. In-range problems without solutions are reported.
	# Globals:   PIDS (written)
	# Arguments: $1 - problems directory
	#            $2 - solutions directory
	#            $3 - first problem ID (inclusive)
	#            $4 - last problem ID (inclusive)
	# Outputs:   one "SKIP problem ..." line on stdout per in-range problem that
	#            has no solutions directory; a warning on stderr if $1 is missing
	#######################################
	collect_problem_ids() {
	  local problems_dir=$1 solutions_dir=$2 first=$3 last=$4
	  local entry pid

	  PIDS=()
	  if [ ! -d "${problems_dir}" ]; then
	    echo "WARNING: problems directory not found: ${problems_dir}" >&2
	    return 0
	  fi

	  while IFS= read -r pid; do
	    # The integer tests fail (with their error message discarded) for names
	    # that are not numbers, which drops such entries from the selection.
	    if [ "${pid}" -ge "${first}" ] 2>/dev/null && [ "${pid}" -le "${last}" ] 2>/dev/null; then
	      # Ensure solution exists for this problem
	      if [ -d "${solutions_dir}/${pid}" ]; then
	        PIDS+=("${pid}")
	      else
	        echo "SKIP problem ${pid}: no solutions available"
	      fi
	    fi
	  done < <(
	    # Enumerate entries with a glob rather than parsing ls output; the
	    # existence test skips the literal pattern left by an empty directory.
	    for entry in "${problems_dir}"/*; do
	      [ -e "${entry}" ] || continue
	      printf '%s\n' "${entry##*/}"
	    done | sort -n
	  )
	}

	collect_problem_ids "${PROBLEMS_DIR}" "${SOLUTIONS_DIR}" "${PID_START}" "${PID_END}"

	# Summary of the run configuration. The "-" default keeps an empty PIDS array
	# from tripping "set -u" on bash versions older than 4.4.
	echo "========================================"
	echo "Frontier-CS Batch Run"
	echo "========================================"
	echo " Problems: ${PIDS[*]-}"
	echo " Total: ${#PIDS[@]}"
	echo " Generations: ${GENS}"
	echo " Parallel: ${PARALLEL}"
	echo " Seed model: ${SEED_MODEL}"
	echo " Eval agent: disabled (interval=${EVAL_TRIGGER_INTERVAL})"
	echo "========================================"
	echo ""

	#######################################
	# Run one experiment per problem ID in PIDS, sequentially. A failing
	# experiment is counted and reported, and the batch continues.
	# Globals:   PIDS, PYTHON, GENS, SEED_MODEL, PARALLEL, EVAL_URL,
	#            EVAL_TRIGGER_MODE, EVAL_TRIGGER_INTERVAL (read)
	#            DONE, FAILED (reset to 0, then written)
	# Outputs:   per-problem progress and an OK/FAILED line on stdout
	#######################################
	run_batch() {
	  local problem_id
	  local total=${#PIDS[@]}

	  DONE=0
	  FAILED=0

	  # ${PIDS[@]+...} expands to nothing for an empty array (safe under set -u).
	  for problem_id in ${PIDS[@]+"${PIDS[@]}"}; do
	    echo "----------------------------------------"
	    # Progress is the number of problems already processed, successful or
	    # not, so the counter keeps advancing after a failure.
	    echo "[$((DONE + FAILED))/${total}] Problem ${problem_id}"
	    echo "----------------------------------------"

	    if "${PYTHON}" tasks/frontier_cs_entry/run_experiment.py \
	      --experiment-name "batch_g${GENS}" \
	      --problem-id "${problem_id}" \
	      --seed-model "${SEED_MODEL}" \
	      --num-generations "${GENS}" \
	      --max-parallel-jobs "${PARALLEL}" \
	      --use-eval-service \
	      --eval-service-url "${EVAL_URL}" \
	      --eval-trigger-mode "${EVAL_TRIGGER_MODE}" \
	      --eval-trigger-interval "${EVAL_TRIGGER_INTERVAL}" \
	      --verbose; then
	      DONE=$((DONE + 1))
	      echo "OK problem ${problem_id}"
	    else
	      FAILED=$((FAILED + 1))
	      echo "FAILED problem ${problem_id}"
	    fi
	    echo ""
	  done
	}

	run_batch

	echo "========================================"
	echo "Batch complete: ${DONE} succeeded, ${FAILED} failed out of ${#PIDS[@]}"
	echo "========================================"