shinka-backup / scripts /ev2_agentic /run_frontier_cs_agentic_from_fork.sh
JustinTX's picture
Add files using upload-large-folder tool
3f6526a verified
#!/bin/bash
# Agentic run: Frontier-CS algorithmic with eval agent, forked from vanilla baseline.
# Based on scripts/dev/run_frontier_cs_parallel_with_agent.sh but runs from a forked dir.
#
# Prerequisites:
# 1. Fork the baseline first:
# bash scripts/ev2_agentic/fork_frontier_cs_baseline.sh
#
# 2. Start the judge service (if not already running):
# (cd tasks/Frontier-CS/algorithmic && node judge/src/server.js)
#
# NOTE: Eval services are started automatically per worker.
#
# Usage:
#   FORKED_DIR=results/frontier_cs_algorithmic/agent_fork_g5_YYYYMMDD_HHMMSS \
#     bash scripts/ev2_agentic/run_frontier_cs_agentic_from_fork.sh
#
#   # Custom parallelism:
#   FORKED_DIR=... CONCURRENCY=8 \
#     bash scripts/ev2_agentic/run_frontier_cs_agentic_from_fork.sh
set -euo pipefail

# Work from the directory two levels above this script so that the relative
# paths used below (.venv/, eval_agent/, tasks/) resolve regardless of the
# caller's current directory.
cd -- "$(dirname -- "$0")/../.."

# Interpreter used for both the eval services and the experiment runs.
PYTHON=".venv/bin/python"

# ============================================================================
# Configuration
# ============================================================================
# Required: directory holding one p<ID> sub-directory per problem.
FORKED_DIR="${FORKED_DIR:?Set FORKED_DIR to the forked experiment directory}"
# Number of problems run simultaneously; each slot owns one eval service.
CONCURRENCY="${CONCURRENCY:-20}"
GENS=50
LLM_MODELS="native-gemini-3-flash-preview"
# Slot N talks to the eval service on port BASE_PORT + N.
BASE_PORT="${BASE_EVAL_PORT:-8760}"
EVAL_TRIGGER_MODE="periodic"
EVAL_TRIGGER_INTERVAL=5
LOG_DIR="${FORKED_DIR}/_worker_logs"

# Fail fast on bad input: a non-positive CONCURRENCY would leave the scheduler
# with no slots to assign work to, and a non-numeric port breaks the port
# arithmetic further down.
if ! [[ "${CONCURRENCY}" =~ ^[1-9][0-9]*$ ]]; then
  echo "ERROR: CONCURRENCY must be a positive integer (got '${CONCURRENCY}')" >&2
  exit 1
fi
if ! [[ "${BASE_PORT}" =~ ^[0-9]+$ ]]; then
  echo "ERROR: BASE_EVAL_PORT must be a non-negative integer (got '${BASE_PORT}')" >&2
  exit 1
fi
# Without this check, the mkdir -p below would silently create a mistyped
# FORKED_DIR and the run would proceed with zero problems.
if [[ ! -d "${FORKED_DIR}" ]]; then
  echo "ERROR: FORKED_DIR '${FORKED_DIR}' is not an existing directory" >&2
  exit 1
fi

mkdir -p -- "${LOG_DIR}"
# ============================================================================
# Collect problem IDs from forked directory
# ============================================================================

#######################################
# List the problem IDs found under a directory.
# A problem is any sub-directory named p<ID>; the leading "p" is stripped and
# the IDs are sorted numerically. Sorting is done on the bare IDs (not on the
# full paths) so a "p" elsewhere in the path cannot disturb the sort key, and
# plain files that happen to start with "p" are ignored.
# Arguments: $1 - directory to scan
# Outputs:   one ID per line on stdout (nothing if there are no matches)
#######################################
collect_problem_ids() {
  local root="$1"
  local pdir pname
  for pdir in "${root}"/p*/; do
    # Also skips the literal pattern left behind when the glob matches nothing.
    [[ -d "${pdir}" ]] || continue
    pname="${pdir%/}"
    pname="${pname##*/}"
    printf '%s\n' "${pname#p}"
  done | sort -n
}

PIDS=()
while IFS= read -r problem_id; do
  PIDS+=("${problem_id}")
done < <(collect_problem_ids "${FORKED_DIR}")
TOTAL=${#PIDS[@]}

if [[ "${TOTAL}" -eq 0 ]]; then
  echo "WARNING: no problem directories (p*) found under ${FORKED_DIR}" >&2
fi
# Print the run configuration once, as a single here-document, followed by a
# blank separator line.
cat <<EOF
========================================
Frontier-CS Agentic (forked baseline)
========================================
 Forked dir: ${FORKED_DIR}
 Problems: ${TOTAL}
 Concurrency: ${CONCURRENCY}
 Generations: ${GENS}
 LLM: ${LLM_MODELS}
 Eval ports: ${BASE_PORT}-$((BASE_PORT + CONCURRENCY - 1))
 Trigger: ${EVAL_TRIGGER_MODE} every ${EVAL_TRIGGER_INTERVAL} gens
 Logs: ${LOG_DIR}/
========================================

EOF
# ============================================================================
# Track all background PIDs for cleanup
# ============================================================================
ALL_SERVICE_PIDS=()

#######################################
# Terminate every eval service and every remaining background job.
# Globals: ALL_SERVICE_PIDS (read)
# Outputs: progress messages on stdout
#######################################
cleanup() {
  local spid job_pid job_pids

  # Ignore further INT/TERM so a second Ctrl-C cannot abort the teardown.
  trap '' INT TERM

  echo ""
  echo "Cleaning up eval services..."
  # The ${arr[@]+...} form keeps `set -u` happy on bash versions that treat an
  # empty array as unset.
  for spid in "${ALL_SERVICE_PIDS[@]+"${ALL_SERVICE_PIDS[@]}"}"; do
    kill "${spid}" 2>/dev/null || true
  done

  job_pids=$(jobs -p) || true
  for job_pid in ${job_pids}; do
    # Workers are launched as backgrounded shell functions, i.e. subshells; a
    # plain kill only signals the subshell itself, so signal its children
    # first (when pkill is available) to avoid leaving them running.
    if command -v pkill > /dev/null 2>&1; then
      pkill -TERM -P "${job_pid}" 2>/dev/null || true
    fi
    kill "${job_pid}" 2>/dev/null || true
  done
  echo "Done."
}

# Run cleanup exactly once, on exit. INT/TERM must end the script explicitly:
# a handler that merely returns would let execution resume after the signal,
# with all workers and services already killed.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
# ============================================================================
# Start/stop eval service helpers
# ============================================================================

#######################################
# Make sure an eval service answers on the given port, launching one if not.
# Globals:   PYTHON, LOG_DIR (read); ALL_SERVICE_PIDS (appended on launch);
#            EVAL_SERVICE_START_ATTEMPTS (read, optional, default 20)
# Arguments: $1 - TCP port for the service
# Outputs:   status line on stdout; error messages on stderr; the service's
#            own output goes to ${LOG_DIR}/eval_service_port_<port>.log
# Returns:   0 once <url>/api/v1/status responds, 1 if the service exits
#            during startup or never responds within the attempt budget
#######################################
start_eval_service() {
  local port="$1"
  local log_file="${LOG_DIR}/eval_service_port_${port}.log"
  local url="http://localhost:${port}"
  local status_url="${url}/api/v1/status"
  local max_attempts="${EVAL_SERVICE_START_ATTEMPTS:-20}"
  local spid attempt

  # --max-time keeps a port that accepts connections but never answers from
  # stalling the probe indefinitely.
  if curl -s --max-time 2 "${status_url}" > /dev/null 2>&1; then
    echo " Eval service already running on port ${port}"
    return 0
  fi

  OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \
    "${PYTHON}" eval_agent/ev2_service_standalone.py \
    --host "0.0.0.0" --port "${port}" \
    > "${log_file}" 2>&1 &
  spid=$!
  ALL_SERVICE_PIDS+=("${spid}")

  # Poll roughly once per second until the status endpoint responds.
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    if curl -s --max-time 2 "${status_url}" > /dev/null 2>&1; then
      echo " Eval service ready on port ${port} (pid=${spid})"
      return 0
    fi
    # Stop waiting as soon as the process is gone; it cannot become ready.
    if ! kill -0 "${spid}" 2>/dev/null; then
      echo " ERROR: Eval service on port ${port} exited during startup (see ${log_file})" >&2
      return 1
    fi
    sleep 1
  done

  echo " ERROR: Eval service failed to start on port ${port}" >&2
  return 1
}
# ============================================================================
# Worker function: run one problem with its own eval service
# ============================================================================

#######################################
# Run the experiment for a single problem against one eval service.
# Globals:   PYTHON, FORKED_DIR, LOG_DIR, GENS, LLM_MODELS,
#            EVAL_TRIGGER_MODE, EVAL_TRIGGER_INTERVAL (all read)
# Arguments: $1 - problem ID (results live in ${FORKED_DIR}/p<ID>)
#            $2 - port of the eval service this worker should use
# Outputs:   one DONE/FAIL line on stdout; the experiment's own output goes
#            to ${LOG_DIR}/problem_<ID>.log
# Returns:   the exit status of the experiment command
#######################################
run_problem_with_agent() {
  local pid="$1"
  local port="$2"
  local url="http://localhost:${port}"
  local results_dir="${FORKED_DIR}/p${pid}"
  local log_file="${LOG_DIR}/problem_${pid}.log"
  local status=0

  # The status is captured with `|| status=$?` because the script runs under
  # `set -e`: a bare failing command would terminate this worker before the
  # FAIL line below could be printed.
  #
  # LLM_MODELS is deliberately left unquoted so that a space-separated list
  # is passed as several arguments.
  # shellcheck disable=SC2086
  "${PYTHON}" tasks/frontier_cs_entry/run_experiment.py \
    --experiment-name "frontier_cs_agentic_p${pid}_g${GENS}" \
    --problem-id "${pid}" \
    --num-generations "${GENS}" \
    --max-parallel-jobs 1 \
    --edit-backend single_shot_patch \
    --llm-models ${LLM_MODELS} \
    --results-dir "${results_dir}" \
    --use-eval-service \
    --eval-service-url "${url}" \
    --eval-trigger-mode "${EVAL_TRIGGER_MODE}" \
    --eval-trigger-interval "${EVAL_TRIGGER_INTERVAL}" \
    --use-wandb \
    --wandb-project frontier-cs \
    --wandb-tags frontier_cs agent forked_g5 "problem_${pid}" \
    --verbose \
    --trajectory-log \
    > "${log_file}" 2>&1 || status=$?

  if [[ "${status}" -eq 0 ]]; then
    echo "DONE problem ${pid} (port ${port})"
  else
    echo "FAIL problem ${pid} (port ${port}, see ${log_file})"
  fi
  return "${status}"
}
# ============================================================================
# Bring up the eval service pool: slot N is served on port BASE_PORT + N
# ============================================================================
echo "Starting ${CONCURRENCY} eval services..."
for ((slot = 0; slot < CONCURRENCY; slot++)); do
  port=$((BASE_PORT + slot))
  start_eval_service "${port}"
done
echo ""
# ============================================================================
# Run problems in parallel, assigning to available slots
# ============================================================================

#######################################
# Run every problem in PIDS, keeping at most CONCURRENCY workers alive.
# Each slot is bound to port BASE_PORT + slot; a finished worker frees its
# slot for the next pending problem.
# Globals:   PIDS, TOTAL, CONCURRENCY, BASE_PORT (read);
#            SCHEDULER_POLL_SECONDS (read, optional, default 2);
#            SLOT_PIDS, SLOT_PROBLEMS, DONE, FAILED, IDX (reset and written)
# Outputs:   one START line per launched problem on stdout
# Returns:   0 when all problems have been run (whatever their outcome),
#            1 if CONCURRENCY is below 1
#######################################
run_all_problems() {
  local slot worker pid port
  local active=0
  local poll_seconds="${SCHEDULER_POLL_SECONDS:-2}"

  SLOT_PIDS=()
  SLOT_PROBLEMS=()
  DONE=0
  FAILED=0
  IDX=0

  if ((TOTAL == 0)); then
    return 0
  fi
  # With no slots nothing could ever be assigned and the loop would spin.
  if ((CONCURRENCY < 1)); then
    echo "ERROR: CONCURRENCY must be at least 1 (got '${CONCURRENCY}')" >&2
    return 1
  fi

  for ((slot = 0; slot < CONCURRENCY; slot++)); do
    SLOT_PIDS+=(0)
    SLOT_PROBLEMS+=("")
  done

  while ((IDX < TOTAL || active > 0)); do
    # Check for finished slots
    for ((slot = 0; slot < CONCURRENCY; slot++)); do
      worker="${SLOT_PIDS[slot]}"
      if ((worker != 0)) && ! kill -0 "${worker}" 2>/dev/null; then
        # `wait` is used as the if-condition so that a failed worker is
        # counted instead of tripping `set -e` and aborting the whole run.
        if wait "${worker}" 2>/dev/null; then
          DONE=$((DONE + 1))
        else
          FAILED=$((FAILED + 1))
        fi
        SLOT_PIDS[slot]=0
        SLOT_PROBLEMS[slot]=""
        active=$((active - 1))
      fi
    done

    # Assign problems to free slots
    for ((slot = 0; slot < CONCURRENCY; slot++)); do
      if ((SLOT_PIDS[slot] == 0 && IDX < TOTAL)); then
        pid="${PIDS[IDX]}"
        port=$((BASE_PORT + slot))
        echo "START problem ${pid} on slot ${slot} (port ${port}) [${DONE}+${FAILED}/${TOTAL} complete]"
        run_problem_with_agent "${pid}" "${port}" &
        SLOT_PIDS[slot]=$!
        SLOT_PROBLEMS[slot]="${pid}"
        IDX=$((IDX + 1))
        active=$((active + 1))
      fi
    done

    # Only pause between polls while something is still running.
    if ((active > 0)); then
      sleep "${poll_seconds}"
    fi
  done
}

run_all_problems
# Final tally, emitted in one here-document; the leading blank line separates
# it from the START lines printed while the run was in progress.
cat <<EOF

========================================
Parallel agentic run complete
 Succeeded: ${DONE}
 Failed: ${FAILED}
 Total: ${TOTAL}
 Results: ${FORKED_DIR}
 Logs: ${LOG_DIR}/
========================================
EOF