#!/bin/bash
#
# Frontier-CS agentic run on top of a forked baseline experiment.
#
# Starts CONCURRENCY eval services on consecutive ports and runs
# tasks/frontier_cs_entry/run_experiment.py once for every problem directory
# (p<ID>) found under FORKED_DIR, at most CONCURRENCY problems at a time.
#
# Environment:
#   FORKED_DIR      (required) forked experiment directory holding the p<ID>
#                   problem directories.
#   CONCURRENCY     (default 20) number of eval services / parallel problems.
#   BASE_EVAL_PORT  (default 8760) first eval-service port; slot N uses
#                   BASE_EVAL_PORT + N.
#
# Eval-service and per-problem logs are written to ${FORKED_DIR}/_worker_logs.

set -euo pipefail

# Every relative path used by this script (.venv, eval_agent/, tasks/) is
# resolved from two directories above the script itself.
cd -- "$(dirname -- "$0")/../.."

# Report a configuration problem on stderr and stop before anything is started.
_config_die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 2
}

PYTHON=".venv/bin/python"

# --- User-tunable settings (environment) -------------------------------------
FORKED_DIR="${FORKED_DIR:?Set FORKED_DIR to the forked experiment directory}"
CONCURRENCY="${CONCURRENCY:-20}"

# --- Fixed experiment settings -----------------------------------------------
GENS=50
LLM_MODELS="native-gemini-3-flash-preview"
BASE_PORT="${BASE_EVAL_PORT:-8760}"

EVAL_TRIGGER_MODE="periodic"
EVAL_TRIGGER_INTERVAL=5

# --- Fail fast on unusable configuration -------------------------------------
# CONCURRENCY and BASE_PORT feed shell arithmetic and loop bounds further down;
# a zero, negative or non-numeric value would otherwise surface much later as
# an arithmetic error or a scheduler loop that never launches anything.
[[ "${CONCURRENCY}" =~ ^[1-9][0-9]*$ ]] \
  || _config_die "CONCURRENCY must be a positive integer (got '${CONCURRENCY}')"
[[ "${BASE_PORT}" =~ ^[1-9][0-9]*$ ]] \
  || _config_die "BASE_EVAL_PORT must be a positive integer (got '${BASE_PORT}')"
(( BASE_PORT + CONCURRENCY - 1 <= 65535 )) \
  || _config_die "port range ${BASE_PORT}-$((BASE_PORT + CONCURRENCY - 1)) exceeds 65535"
# Without this check a mistyped FORKED_DIR would be silently created by the
# mkdir -p below and the run would proceed with zero problems.
[[ -d "${FORKED_DIR}" ]] \
  || _config_die "FORKED_DIR is not a directory: ${FORKED_DIR}"
[[ -x "${PYTHON}" ]] \
  || _config_die "Python interpreter not found or not executable: ${PWD}/${PYTHON}"

LOG_DIR="${FORKED_DIR}/_worker_logs"
mkdir -p -- "${LOG_DIR}"
# -----------------------------------------------------------------------------
# Problem discovery
# -----------------------------------------------------------------------------

#######################################
# List the problem ids found in an experiment directory.
# Every sub-directory named p<ID> contributes <ID> (the name minus its leading
# "p"); plain files that merely start with "p" are ignored.
# Arguments: $1 - experiment directory to scan
# Outputs:   one id per line on stdout, in ascending numeric order
# Returns:   0 (also when nothing matches; the output is then empty)
#######################################
_discover_problem_ids() {
  local root="$1"
  local entry name
  local -a ids=()

  # The trailing slash restricts the glob to directories. Globbing instead of
  # parsing `ls` keeps paths containing whitespace intact.
  for entry in "${root}"/p*/; do
    [[ -d "${entry}" ]] || continue # an unmatched glob stays literal
    name="${entry%/}"
    name="${name##*/}"
    ids+=("${name#p}")
  done

  (( ${#ids[@]} > 0 )) || return 0
  # Sort the bare ids rather than the full paths: a key split on "p" picks the
  # wrong field as soon as the directory path contains another "p".
  printf '%s\n' "${ids[@]}" | LC_ALL=C sort -n
}

PIDS=()
while IFS= read -r problem_id; do
  PIDS+=("${problem_id}")
done < <(_discover_problem_ids "${FORKED_DIR}")

TOTAL=${#PIDS[@]}
# -----------------------------------------------------------------------------
# Run banner: echo the effective configuration before any work starts.
# The eval port range covers one port per concurrency slot.
# -----------------------------------------------------------------------------
printf '%s\n' \
  "========================================" \
  "Frontier-CS Agentic (forked baseline)" \
  "========================================" \
  " Forked dir: ${FORKED_DIR}" \
  " Problems: ${TOTAL}" \
  " Concurrency: ${CONCURRENCY}" \
  " Generations: ${GENS}" \
  " LLM: ${LLM_MODELS}" \
  " Eval ports: ${BASE_PORT}-$((BASE_PORT + CONCURRENCY - 1))" \
  " Trigger: ${EVAL_TRIGGER_MODE} every ${EVAL_TRIGGER_INTERVAL} gens" \
  " Logs: ${LOG_DIR}/" \
  "========================================" \
  ""
# -----------------------------------------------------------------------------
# Cleanup
# -----------------------------------------------------------------------------

# PIDs of the eval services launched by this script.
ALL_SERVICE_PIDS=()

#######################################
# Stop the eval services recorded in ALL_SERVICE_PIDS and any background job
# of this shell that is still running. Best effort: processes that are already
# gone are ignored.
# Globals:   ALL_SERVICE_PIDS (read)
# Arguments: none
# Outputs:   progress messages on stdout
#######################################
cleanup() {
  local victim

  echo ""
  echo "Cleaning up eval services..."
  # The ${arr[@]+...} form keeps an empty array from tripping `set -u`.
  for victim in "${ALL_SERVICE_PIDS[@]+"${ALL_SERVICE_PIDS[@]}"}"; do
    kill "${victim}" 2>/dev/null || true
  done
  # Remaining background jobs of this shell (e.g. in-flight problem runners).
  for victim in $(jobs -p); do
    kill "${victim}" 2>/dev/null || true
  done
  echo "Done."
}

# cleanup is attached to EXIT only. The signal handlers just exit with the
# conventional 128+signal status, which in turn fires the EXIT trap. If cleanup
# were attached to INT/TERM directly, the script would resume after the handler
# returned and keep scheduling work against services it had just killed.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
| start_eval_service() { |
| local port="$1" |
| local log_file="${LOG_DIR}/eval_service_port_${port}.log" |
| local url="http://localhost:${port}" |
|
|
| if curl -s "${url}/api/v1/status" > /dev/null 2>&1; then |
| echo " Eval service already running on port ${port}" |
| return 0 |
| fi |
|
|
| OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \ |
| ${PYTHON} eval_agent/ev2_service_standalone.py \ |
| --host "0.0.0.0" --port "${port}" \ |
| > "${log_file}" 2>&1 & |
| local spid=$! |
| ALL_SERVICE_PIDS+=("${spid}") |
|
|
| for i in $(seq 1 20); do |
| if curl -s "${url}/api/v1/status" > /dev/null 2>&1; then |
| echo " Eval service ready on port ${port} (pid=${spid})" |
| return 0 |
| fi |
| sleep 1 |
| done |
|
|
| echo " ERROR: Eval service failed to start on port ${port}" |
| return 1 |
| } |
|
|
| |
| |
| |
# -----------------------------------------------------------------------------
# Per-problem runner
# -----------------------------------------------------------------------------

#######################################
# Run one problem through tasks/frontier_cs_entry/run_experiment.py against
# the eval service listening on the given port.
# Globals:   PYTHON, FORKED_DIR, LOG_DIR, GENS, LLM_MODELS,
#            EVAL_TRIGGER_MODE, EVAL_TRIGGER_INTERVAL (all read)
# Arguments: $1 - problem id (results go to ${FORKED_DIR}/p<id>)
#            $2 - port of the eval service to use
# Outputs:   a "DONE ..." or "FAIL ..." line on stdout; the experiment's own
#            output goes to ${LOG_DIR}/problem_<id>.log
# Returns:   the exit status of run_experiment.py
#######################################
run_problem_with_agent() {
  local pid="$1"
  local port="$2"
  local url="http://localhost:${port}"
  local results_dir="${FORKED_DIR}/p${pid}"
  local log_file="${LOG_DIR}/problem_${pid}.log"
  local status=0
  local -a llm_models=()

  # LLM_MODELS is a whitespace-separated list; each entry must reach the
  # Python CLI as its own argument.
  read -r -a llm_models <<< "${LLM_MODELS}"

  # `|| status=$?` captures the failure instead of letting `set -e` terminate
  # this (background) shell before the FAIL line below can be printed.
  "${PYTHON}" tasks/frontier_cs_entry/run_experiment.py \
    --experiment-name "frontier_cs_agentic_p${pid}_g${GENS}" \
    --problem-id "${pid}" \
    --num-generations "${GENS}" \
    --max-parallel-jobs 1 \
    --edit-backend single_shot_patch \
    --llm-models ${llm_models[@]+"${llm_models[@]}"} \
    --results-dir "${results_dir}" \
    --use-eval-service \
    --eval-service-url "${url}" \
    --eval-trigger-mode "${EVAL_TRIGGER_MODE}" \
    --eval-trigger-interval "${EVAL_TRIGGER_INTERVAL}" \
    --use-wandb \
    --wandb-project frontier-cs \
    --wandb-tags frontier_cs agent forked_g5 "problem_${pid}" \
    --verbose \
    --trajectory-log \
    > "${log_file}" 2>&1 || status=$?

  if (( status == 0 )); then
    echo "DONE problem ${pid} (port ${port})"
  else
    echo "FAIL problem ${pid} (port ${port}, see ${LOG_DIR}/problem_${pid}.log)"
  fi
  return "${status}"
}
# -----------------------------------------------------------------------------
# Bring up one eval service per concurrency slot; slot N listens on
# BASE_PORT + N. Services are started one after another, and because the
# script runs under `set -e` a service that fails to start aborts the run.
# -----------------------------------------------------------------------------
printf '%s\n' "Starting ${CONCURRENCY} eval services..."
slot=0
while [ "${slot}" -lt "${CONCURRENCY}" ]; do
  port=$((BASE_PORT + slot))
  start_eval_service "${port}"
  slot=$((slot + 1))
done
printf '\n'
# -----------------------------------------------------------------------------
# Scheduler: run every problem, at most CONCURRENCY at a time
# -----------------------------------------------------------------------------

# Per-slot state: PID of the runner occupying the slot (0 = free) and the
# problem id it is working on.
SLOT_PIDS=()
SLOT_PROBLEMS=()

DONE=0   # problems whose runner exited with status 0
FAILED=0 # problems whose runner exited non-zero
IDX=0    # index into PIDS of the next problem to launch

#######################################
# Drive all problems in PIDS through run_problem_with_agent, keeping up to
# CONCURRENCY runners alive. Slot N always talks to the eval service on port
# BASE_PORT + N. Finished runners are detected by polling every two seconds.
# Globals:   PIDS, TOTAL, CONCURRENCY, BASE_PORT (read);
#            SLOT_PIDS, SLOT_PROBLEMS, DONE, FAILED, IDX (reset and updated)
# Arguments: none
# Outputs:   one "START ..." line per launched problem on stdout
# Returns:   0 once every problem has finished (failures are reported through
#            FAILED, not the return code); 1 if there is work but no slot
#######################################
_run_scheduler() {
  local slot problem port
  local active=0 # number of occupied slots

  if (( TOTAL > 0 && CONCURRENCY < 1 )); then
    # With no slots nothing could ever be launched and the loop below would
    # never terminate.
    echo "ERROR: CONCURRENCY must be at least 1 (got '${CONCURRENCY}')" >&2
    return 1
  fi

  SLOT_PIDS=()
  SLOT_PROBLEMS=()
  for (( slot = 0; slot < CONCURRENCY; slot++ )); do
    SLOT_PIDS+=(0)
    SLOT_PROBLEMS+=("")
  done
  DONE=0
  FAILED=0
  IDX=0

  while (( IDX < TOTAL || active > 0 )); do
    # Reap runners that have exited and free their slots.
    for (( slot = 0; slot < CONCURRENCY; slot++ )); do
      (( SLOT_PIDS[slot] != 0 )) || continue
      if kill -0 "${SLOT_PIDS[slot]}" 2>/dev/null; then
        continue # still running
      fi
      # `wait` must sit inside the `if`: as a bare command its non-zero status
      # would trigger `set -e` and abort the entire run on the first failure.
      if wait "${SLOT_PIDS[slot]}" 2>/dev/null; then
        DONE=$((DONE + 1))
      else
        FAILED=$((FAILED + 1))
      fi
      SLOT_PIDS[slot]=0
      SLOT_PROBLEMS[slot]=""
      active=$((active - 1))
    done

    # Hand the next pending problems to whichever slots are free.
    for (( slot = 0; slot < CONCURRENCY && IDX < TOTAL; slot++ )); do
      (( SLOT_PIDS[slot] == 0 )) || continue
      problem="${PIDS[IDX]}"
      port=$((BASE_PORT + slot))
      echo "START problem ${problem} on slot ${slot} (port ${port}) [${DONE}+${FAILED}/${TOTAL} complete]"
      run_problem_with_agent "${problem}" "${port}" &
      SLOT_PIDS[slot]=$!
      SLOT_PROBLEMS[slot]="${problem}"
      IDX=$((IDX + 1))
      active=$((active + 1))
    done

    # Nothing pending and nothing running: skip the final poll delay.
    (( IDX < TOTAL || active > 0 )) || break
    sleep 2
  done
}

_run_scheduler

echo ""
echo "========================================"
echo "Parallel agentic run complete"
echo " Succeeded: ${DONE}"
echo " Failed: ${FAILED}"
echo " Total: ${TOTAL}"
echo " Results: ${FORKED_DIR}"
echo " Logs: ${LOG_DIR}/"
echo "========================================"

# Let callers detect a partially failed run from the exit status.
if (( FAILED > 0 )); then
  exit 1
fi