#!/bin/bash
# Agentic run: Frontier-CS algorithmic with eval agent, forked from vanilla baseline.
# Based on scripts/dev/run_frontier_cs_parallel_with_agent.sh but runs from a forked dir.
#
# Prerequisites:
#   1. Fork the baseline first:
#        bash scripts/ev2_agentic/fork_frontier_cs_baseline.sh
#
#   2. Start the judge service (if not already running):
#        (cd tasks/Frontier-CS/algorithmic && node judge/src/server.js)
#
# NOTE: Eval services are started automatically per worker.
#
# Usage:
#   FORKED_DIR=results/frontier_cs_algorithmic/agent_fork_g5_YYYYMMDD_HHMMSS \
#     bash scripts/ev2_agentic/run_frontier_cs_agentic.sh
#
#   # Custom parallelism:
#   FORKED_DIR=... CONCURRENCY=8 \
#     bash scripts/ev2_agentic/run_frontier_cs_agentic.sh
#
# Environment:
#   FORKED_DIR      (required) forked experiment directory containing p<ID> entries
#   CONCURRENCY     (default 20) number of parallel workers / eval services
#   BASE_EVAL_PORT  (default 8760) port of the first eval service; slot N uses
#                   BASE_EVAL_PORT + N
#
# Exit status: 0 when every problem succeeded, non-zero otherwise.

set -euo pipefail
cd "$(dirname "$0")/../.."

PYTHON=".venv/bin/python"

# ============================================================================
# Configuration
# ============================================================================
FORKED_DIR="${FORKED_DIR:?Set FORKED_DIR to the forked experiment directory}"
CONCURRENCY="${CONCURRENCY:-20}"
GENS=50
LLM_MODELS="native-gemini-3-flash-preview"
BASE_PORT="${BASE_EVAL_PORT:-8760}"
EVAL_TRIGGER_MODE="periodic"
EVAL_TRIGGER_INTERVAL=5

# Validate before creating anything: CONCURRENCY=0 would leave the scheduler
# with no slots and make it loop forever.
if ! [[ "${CONCURRENCY}" =~ ^[1-9][0-9]*$ ]]; then
  echo "ERROR: CONCURRENCY must be a positive integer (got '${CONCURRENCY}')" >&2
  exit 1
fi
if [ ! -d "${FORKED_DIR}" ]; then
  echo "ERROR: FORKED_DIR is not a directory: ${FORKED_DIR}" >&2
  exit 1
fi

LOG_DIR="${FORKED_DIR}/_worker_logs"
mkdir -p "${LOG_DIR}"

# ============================================================================
# Collect problem IDs from forked directory
# ============================================================================
# Glob instead of parsing `ls`, and sort on the bare ID rather than on the full
# path (splitting the path on the letter 'p' mis-sorts whenever a parent
# directory name contains a 'p').
PIDS=()
shopt -s nullglob
problem_paths=("${FORKED_DIR}"/p*)
shopt -u nullglob

if [ ${#problem_paths[@]} -gt 0 ]; then
  # while-read rather than mapfile so the script also runs on bash < 4.
  while IFS= read -r problem_id; do
    PIDS+=("${problem_id}")
  done < <(
    for pdir in "${problem_paths[@]}"; do
      pname="${pdir##*/}"
      printf '%s\n' "${pname#p}"
    done | sort -n
  )
fi
TOTAL=${#PIDS[@]}

if [ "${TOTAL}" -eq 0 ]; then
  echo "ERROR: no problem entries (p*) found in ${FORKED_DIR}" >&2
  exit 1
fi

echo "========================================"
echo "Frontier-CS Agentic (forked baseline)"
echo "========================================"
echo " Forked dir: ${FORKED_DIR}"
echo " Problems: ${TOTAL}"
echo " Concurrency: ${CONCURRENCY}"
echo " Generations: ${GENS}"
echo " LLM: ${LLM_MODELS}"
echo " Eval ports: ${BASE_PORT}-$((BASE_PORT + CONCURRENCY - 1))"
echo " Trigger: ${EVAL_TRIGGER_MODE} every ${EVAL_TRIGGER_INTERVAL} gens"
echo " Logs: ${LOG_DIR}/"
echo "========================================"
echo ""

# ============================================================================
# Track all background PIDs for cleanup
# ============================================================================
ALL_SERVICE_PIDS=()

# Kill every eval service this script started plus any remaining background
# jobs. Best effort: processes that are already gone are ignored.
cleanup() {
  local spid
  echo ""
  echo "Cleaning up eval services..."
  # The ${arr[@]+...} form keeps `set -u` happy on an empty array in old bash.
  for spid in "${ALL_SERVICE_PIDS[@]+"${ALL_SERVICE_PIDS[@]}"}"; do
    kill "${spid}" 2>/dev/null || true
  done
  jobs -p | xargs -r kill 2>/dev/null || true
  echo "Done."
}
# cleanup runs exactly once, from the EXIT trap. The signal traps only turn the
# signal into an exit; a handler that merely returned would let the script
# carry on after Ctrl-C and run cleanup a second time at exit.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# ============================================================================
# Start/stop eval service helpers
# ============================================================================

# Succeeds when the status endpoint under base URL $1 answers.
# --max-time keeps a wedged listener from blocking the probe indefinitely.
eval_service_ready() {
  curl -s --max-time 5 "$1/api/v1/status" > /dev/null 2>&1
}

# Start an eval service on the given port unless one already answers there.
# Arguments: $1 - TCP port
# Globals:   PYTHON, LOG_DIR (read); ALL_SERVICE_PIDS (appended)
# Returns:   0 once the status endpoint answers, 1 if the service exits during
#            startup or is not ready after 20 one-second polls.
start_eval_service() {
  local port="$1"
  local log_file="${LOG_DIR}/eval_service_port_${port}.log"
  local url="http://localhost:${port}"
  local spid attempt

  if eval_service_ready "${url}"; then
    echo " Eval service already running on port ${port}"
    return 0
  fi

  OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \
    "${PYTHON}" eval_agent/ev2_service_standalone.py \
    --host "0.0.0.0" --port "${port}" \
    > "${log_file}" 2>&1 &
  spid=$!
  ALL_SERVICE_PIDS+=("${spid}")

  for ((attempt = 1; attempt <= 20; attempt++)); do
    if eval_service_ready "${url}"; then
      echo " Eval service ready on port ${port} (pid=${spid})"
      return 0
    fi
    # Fail fast instead of polling a process that has already died.
    if ! kill -0 "${spid}" 2>/dev/null; then
      echo " ERROR: Eval service exited during startup on port ${port} (see ${log_file})" >&2
      return 1
    fi
    sleep 1
  done

  echo " ERROR: Eval service failed to start on port ${port}" >&2
  return 1
}

# ============================================================================
# Worker function: run one problem with its own eval service
# ============================================================================
# Arguments: $1 - problem ID, $2 - port of the eval service for this slot
# Outputs:   one DONE/FAIL line on stdout; experiment output goes to
#            ${LOG_DIR}/problem_<ID>.log
# Returns:   exit status of run_experiment.py
run_problem_with_agent() {
  local pid="$1"
  local port="$2"
  local url="http://localhost:${port}"
  local results_dir="${FORKED_DIR}/p${pid}"
  local log_file="${LOG_DIR}/problem_${pid}.log"
  local status=0
  local -a llm_models

  # LLM_MODELS may hold several space-separated names; split it into an array
  # so each one becomes its own argument without glob expansion.
  read -r -a llm_models <<< "${LLM_MODELS}"

  # `|| status=$?` is required: under `set -e` a bare failing command would
  # terminate this background subshell before the FAIL line is printed.
  "${PYTHON}" tasks/frontier_cs_entry/run_experiment.py \
    --experiment-name "frontier_cs_agentic_p${pid}_g${GENS}" \
    --problem-id "${pid}" \
    --num-generations "${GENS}" \
    --max-parallel-jobs 1 \
    --edit-backend single_shot_patch \
    --llm-models "${llm_models[@]}" \
    --results-dir "${results_dir}" \
    --use-eval-service \
    --eval-service-url "${url}" \
    --eval-trigger-mode "${EVAL_TRIGGER_MODE}" \
    --eval-trigger-interval "${EVAL_TRIGGER_INTERVAL}" \
    --use-wandb \
    --wandb-project frontier-cs \
    --wandb-tags frontier_cs agent forked_g5 "problem_${pid}" \
    --verbose \
    --trajectory-log \
    > "${log_file}" 2>&1 || status=$?

  if [ "${status}" -eq 0 ]; then
    echo "DONE problem ${pid} (port ${port})"
  else
    echo "FAIL problem ${pid} (port ${port}, see ${log_file})"
  fi
  return "${status}"
}

# ============================================================================
# Start eval services (one per concurrency slot)
# ============================================================================
echo "Starting ${CONCURRENCY} eval services..."
for ((slot = 0; slot < CONCURRENCY; slot++)); do
  # A service that cannot start aborts the run (cleanup fires via the EXIT trap).
  start_eval_service "$((BASE_PORT + slot))" || exit 1
done
echo ""

# ============================================================================
# Run problems in parallel, assigning to available slots
# ============================================================================
# SLOT_PIDS[slot] holds the worker PID occupying the slot, or 0 when free.
SLOT_PIDS=()
SLOT_PROBLEMS=()
for ((slot = 0; slot < CONCURRENCY; slot++)); do
  SLOT_PIDS+=(0)
  SLOT_PROBLEMS+=("")
done

DONE=0
FAILED=0
IDX=0
ACTIVE=0

while ((IDX < TOTAL || ACTIVE > 0)); do
  # Check for finished slots
  for ((slot = 0; slot < CONCURRENCY; slot++)); do
    worker_pid="${SLOT_PIDS[$slot]}"
    if ((worker_pid != 0)) && ! kill -0 "${worker_pid}" 2>/dev/null; then
      # `wait` must be the if-condition: run as a plain statement, a non-zero
      # worker status would trip `set -e` and abort the whole run.
      if wait "${worker_pid}" 2>/dev/null; then
        DONE=$((DONE + 1))
      else
        FAILED=$((FAILED + 1))
      fi
      SLOT_PIDS[$slot]=0
      SLOT_PROBLEMS[$slot]=""
      ACTIVE=$((ACTIVE - 1))
    fi
  done

  # Assign problems to free slots
  for ((slot = 0; slot < CONCURRENCY; slot++)); do
    if ((SLOT_PIDS[slot] == 0 && IDX < TOTAL)); then
      pid="${PIDS[$IDX]}"
      port=$((BASE_PORT + slot))
      echo "START problem ${pid} on slot ${slot} (port ${port}) [${DONE}+${FAILED}/${TOTAL} complete]"
      run_problem_with_agent "${pid}" "${port}" &
      SLOT_PIDS[$slot]=$!
      SLOT_PROBLEMS[$slot]="${pid}"
      IDX=$((IDX + 1))
      ACTIVE=$((ACTIVE + 1))
    fi
  done

  # Poll interval; skipped once nothing is running (the loop is about to end).
  if ((ACTIVE > 0)); then
    sleep 2
  fi
done

echo ""
echo "========================================"
echo "Parallel agentic run complete"
echo " Succeeded: ${DONE}"
echo " Failed: ${FAILED}"
echo " Total: ${TOTAL}"
echo " Results: ${FORKED_DIR}"
echo " Logs: ${LOG_DIR}/"
echo "========================================"

# Report failures through the exit status so callers and CI can detect them.
if ((FAILED > 0)); then
  exit 1
fi