File size: 7,939 Bytes

3f6526a

#!/bin/bash
# Parallel vanilla run via eval service: same pipeline as agent but with
# eval agent trigger interval set to 1000 (effectively never triggers).
# This ensures the evaluation code path is identical to the agent run,
# making the comparison fair.
#
# Usage:
#   ./scripts/dev/run_frontier_cs_parallel_vanilla_server.sh              # all 172 problems, 20 parallel
#   ./scripts/dev/run_frontier_cs_parallel_vanilla_server.sh 50           # first 50 problems, 20 parallel
#   ./scripts/dev/run_frontier_cs_parallel_vanilla_server.sh 50 10        # first 50 problems, 10 parallel

set -euo pipefail
cd "$(dirname "$0")/../.."

PYTHON=".venv/bin/python"

# ============================================================================
# Configuration
# ============================================================================
# Positional arguments:
#   $1  number of problems to run ("all" or a non-negative integer; default all)
#   $2  number of parallel slots / eval services (positive integer; default 20)
NUM_PROBLEMS="${1:-all}"
CONCURRENCY="${2:-20}"

# Validate the arguments up front. NUM_PROBLEMS is later used as an array
# slice length and CONCURRENCY as a loop bound; a concurrency of 0 would
# create no slots and leave the scheduling loop spinning forever.
if [[ "${NUM_PROBLEMS}" != "all" && ! "${NUM_PROBLEMS}" =~ ^(0|[1-9][0-9]*)$ ]]; then
    echo "ERROR: NUM_PROBLEMS must be 'all' or a non-negative integer (got '${NUM_PROBLEMS}')" >&2
    exit 2
fi
if [[ ! "${CONCURRENCY}" =~ ^[1-9][0-9]*$ ]]; then
    echo "ERROR: CONCURRENCY must be a positive integer (got '${CONCURRENCY}')" >&2
    exit 2
fi

GENS=50
SEED_MODEL="gemini3pro"
LLM_MODELS="native-gemini-3-flash-preview"
BASE_PORT=8860   # different port range from agent (8760) to allow concurrent runs

EVAL_TRIGGER_MODE="periodic"
EVAL_TRIGGER_INTERVAL=1000  # effectively never triggers eval agent

# Every invocation gets its own timestamped run directory name.
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
EXP_NAME="vanilla_g${GENS}"
RUN_DIR="results/frontier_cs_algorithmic/${EXP_NAME}_${TIMESTAMP}"

PROBLEMS_DIR="tasks/Frontier-CS/algorithmic/problems"
SOLUTIONS_DIR="tasks/Frontier-CS/algorithmic/solutions"
LOG_DIR="logs/frontier_cs_parallel_vanilla"
mkdir -p "${LOG_DIR}"

# ============================================================================
# Collect valid problem IDs (sorted numerically, take first N)
# ============================================================================
ALL_PIDS=()
# Enumerate entries of PROBLEMS_DIR with a glob and read them back one per
# line, so names are never word-split or glob-expanded the way the output of
# `$(ls ...)` would be. Only entries that also have a directory under
# SOLUTIONS_DIR are kept; `sort -n` orders the IDs numerically.
while IFS= read -r pid; do
    if [ -d "${SOLUTIONS_DIR}/${pid}" ]; then
        ALL_PIDS+=("${pid}")
    fi
done < <(
    for entry in "${PROBLEMS_DIR}"/*; do
        # An unmatched glob stays literal; skip it.
        if [ -e "${entry}" ]; then
            printf '%s\n' "${entry##*/}"
        fi
    done | sort -n
)

# The ${arr[@]+...} form keeps an empty ALL_PIDS from tripping `set -u` on
# bash releases older than 4.4 (same guard the cleanup handler uses).
if [ "${NUM_PROBLEMS}" = "all" ]; then
    PIDS=(${ALL_PIDS[@]+"${ALL_PIDS[@]}"})
else
    PIDS=(${ALL_PIDS[@]+"${ALL_PIDS[@]:0:${NUM_PROBLEMS}}"})
fi

# Print the run configuration banner in one go; the blank line before the
# terminator reproduces the trailing empty line of the banner.
cat <<EOF
========================================
Frontier-CS Parallel Vanilla (eval service, no agent trigger)
========================================
  Problems:      ${#PIDS[@]} / ${#ALL_PIDS[@]} available
  Concurrency:   ${CONCURRENCY}
  Generations:   ${GENS}
  Seed model:    ${SEED_MODEL}
  LLM:           ${LLM_MODELS}
  Run dir:       ${RUN_DIR}
  Eval ports:    ${BASE_PORT}-$((BASE_PORT + CONCURRENCY - 1))
  Trigger:       ${EVAL_TRIGGER_MODE} every ${EVAL_TRIGGER_INTERVAL} gens (disabled)
  Logs:          ${LOG_DIR}/
========================================

EOF

# ============================================================================
# Track all background PIDs for cleanup
# ============================================================================
ALL_SERVICE_PIDS=()

# cleanup: stop every eval service recorded in ALL_SERVICE_PIDS, then signal
# any remaining background jobs of this shell. Every kill is best-effort
# (errors are ignored) because the processes may already be gone.
cleanup() {
    local spid
    echo ""
    echo "Cleaning up eval services..."
    # Unquoted ${arr[@]+...} wrapper: expands to zero words when the array is
    # empty, and stays safe under `set -u` on older bash.
    for spid in ${ALL_SERVICE_PIDS[@]+"${ALL_SERVICE_PIDS[@]}"}; do
        kill "${spid}" 2>/dev/null || true
    done
    # Also kill any child processes
    jobs -p | xargs -r kill 2>/dev/null || true
    echo "Done."
}
# Run cleanup exactly once, from the EXIT trap. The signal traps must call
# `exit`: a handler that merely returns lets the script resume where it was
# interrupted, i.e. keep scheduling problems after Ctrl-C.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# ============================================================================
# Start/stop eval service helpers
# ============================================================================
# start_eval_service PORT
#
# Ensure an eval service answers on http://localhost:PORT.
#   - If the status endpoint already responds, nothing is started.
#   - Otherwise the standalone service is launched in the background with its
#     output redirected to ${LOG_DIR}/eval_service_port_PORT.log, its PID is
#     appended to ALL_SERVICE_PIDS, and the status endpoint is polled up to
#     20 times, one second apart.
# Globals: PYTHON, LOG_DIR (read); ALL_SERVICE_PIDS (appended).
# Returns: 0 when the service responds, 1 when it exits early or never
#          becomes ready.
start_eval_service() {
    local port="$1"
    local log_file="${LOG_DIR}/eval_service_port_${port}.log"
    local url="http://localhost:${port}"
    local spid attempt

    # Check if already running on this port. --max-time bounds each probe so a
    # socket that accepts but never answers cannot hang the script.
    if curl -s --max-time 2 "${url}/api/v1/status" > /dev/null 2>&1; then
        echo "  Eval service already running on port ${port}"
        return 0
    fi

    "${PYTHON}" eval_agent/ev2_service_standalone.py \
        --host "0.0.0.0" --port "${port}" \
        > "${log_file}" 2>&1 &
    spid=$!
    ALL_SERVICE_PIDS+=("${spid}")

    # Wait for ready (20 attempts, 1s apart)
    for ((attempt = 1; attempt <= 20; attempt++)); do
        if curl -s --max-time 2 "${url}/api/v1/status" > /dev/null 2>&1; then
            echo "  Eval service ready on port ${port} (pid=${spid})"
            return 0
        fi
        # Stop polling as soon as the service process itself has gone away.
        if ! kill -0 "${spid}" 2>/dev/null; then
            echo "  ERROR: Eval service exited before becoming ready on port ${port} (see ${log_file})"
            return 1
        fi
        sleep 1
    done

    echo "  ERROR: Eval service failed to start on port ${port}"
    return 1
}

# ============================================================================
# Worker function: run one problem via eval service (agent never triggers)
# ============================================================================
# run_problem_vanilla PROBLEM_ID PORT
#
# Run one problem through run_experiment.py against the eval service at
# http://localhost:PORT, writing all output of the run to
# ${LOG_DIR}/problem_PROBLEM_ID.log. Prints a one-line DONE/FAIL marker on
# stdout.
# Globals: PYTHON, EXP_NAME, SEED_MODEL, GENS, LLM_MODELS, RUN_DIR,
#          EVAL_TRIGGER_MODE, EVAL_TRIGGER_INTERVAL, LOG_DIR (read).
# Exports: FRONTIER_CS_PROBLEM_ID, FRONTIER_CS_JUDGE_URL.
# Returns: the exit status of the experiment process.
run_problem_vanilla() {
    local pid="$1"
    local port="$2"
    local url="http://localhost:${port}"
    local log_file="${LOG_DIR}/problem_${pid}.log"
    local status=0

    export FRONTIER_CS_PROBLEM_ID="${pid}"
    export FRONTIER_CS_JUDGE_URL="http://localhost:8081"

    # The status is captured with `|| status=$?` because the script runs under
    # `set -e`: a bare failing command would terminate the shell right here,
    # before the FAIL line below could be printed.
    # LLM_MODELS is left unquoted on purpose so a space-separated list expands
    # to several arguments.
    # shellcheck disable=SC2086
    "${PYTHON}" tasks/frontier_cs_entry/run_experiment.py \
        --experiment-name "${EXP_NAME}" \
        --problem-id "${pid}" \
        --seed-model "${SEED_MODEL}" \
        --num-generations "${GENS}" \
        --max-parallel-jobs 1 \
        --edit-backend single_shot_patch \
        --llm-models ${LLM_MODELS} \
        --run-dir "${RUN_DIR}" \
        --use-eval-service \
        --eval-service-url "${url}" \
        --eval-trigger-mode "${EVAL_TRIGGER_MODE}" \
        --eval-trigger-interval "${EVAL_TRIGGER_INTERVAL}" \
        --use-wandb \
        --wandb-project frontier-cs \
        --wandb-tags frontier_cs vanilla "problem_${pid}" \
        --verbose \
        > "${log_file}" 2>&1 || status=$?

    if [ "${status}" -eq 0 ]; then
        echo "DONE problem ${pid} (port ${port})"
    else
        echo "FAIL problem ${pid} (port ${port}, see ${log_file})"
    fi
    return "${status}"
}

# ============================================================================
# Start eval services (one per concurrency slot)
# ============================================================================
# Bring up one eval service per concurrency slot; slot N listens on
# BASE_PORT + N.
echo "Starting ${CONCURRENCY} eval services..."
for ((slot = 0; slot < CONCURRENCY; slot++)); do
    start_eval_service "$((BASE_PORT + slot))"
done
echo ""

# ============================================================================
# Run problems in parallel, assigning to available slots
# ============================================================================
# Slot tracking: which slot is free
SLOT_PIDS=()       # background job PID per slot (0 = free)
SLOT_PROBLEMS=()   # problem ID currently assigned to each slot ("" = none)
for ((slot = 0; slot < CONCURRENCY; slot++)); do
    SLOT_PIDS+=(0)
    SLOT_PROBLEMS+=("")
done

DONE=0
FAILED=0
TOTAL=${#PIDS[@]}
IDX=0      # index of the next problem in PIDS to launch
ACTIVE=0   # number of slots with a job still running

# Without any slot, pending problems could never be launched and the loop
# below would never terminate.
if [ "${TOTAL}" -gt 0 ] && [ "${#SLOT_PIDS[@]}" -eq 0 ]; then
    echo "ERROR: CONCURRENCY must be at least 1 (got '${CONCURRENCY}')" >&2
    exit 1
fi

# Poll every 2 seconds until every problem has been launched and every slot
# has drained.
while [ "${IDX}" -lt "${TOTAL}" ] || [ "${ACTIVE}" -gt 0 ]; do
    # Check for finished slots
    for ((slot = 0; slot < CONCURRENCY; slot++)); do
        job_pid="${SLOT_PIDS[$slot]}"
        # Skip free slots and slots whose job is still alive.
        if [ "${job_pid}" -eq 0 ] || kill -0 "${job_pid}" 2>/dev/null; then
            continue
        fi
        # Job finished: collect its exit status. `wait` has to sit inside the
        # `if` condition; as a standalone command under `set -e` a non-zero
        # status would abort the whole script on the first failed problem.
        if wait "${job_pid}" 2>/dev/null; then
            DONE=$((DONE + 1))
        else
            FAILED=$((FAILED + 1))
        fi
        SLOT_PIDS[$slot]=0
        SLOT_PROBLEMS[$slot]=""
        ACTIVE=$((ACTIVE - 1))
    done

    # Assign problems to free slots
    for ((slot = 0; slot < CONCURRENCY; slot++)); do
        if [ "${SLOT_PIDS[$slot]}" -eq 0 ] && [ "${IDX}" -lt "${TOTAL}" ]; then
            pid="${PIDS[$IDX]}"
            port=$((BASE_PORT + slot))
            echo "START problem ${pid} on slot ${slot} (port ${port}) [${DONE}+${FAILED}/${TOTAL} complete]"
            run_problem_vanilla "${pid}" "${port}" &
            SLOT_PIDS[$slot]=$!
            SLOT_PROBLEMS[$slot]="${pid}"
            ACTIVE=$((ACTIVE + 1))
            IDX=$((IDX + 1))
        fi
    done

    sleep 2
done

# Final report: one printf call, one argument per output line.
printf '%s\n' \
    "" \
    "========================================" \
    "Parallel vanilla (via eval service) complete" \
    "  Succeeded: ${DONE}" \
    "  Failed:    ${FAILED}" \
    "  Total:     ${TOTAL}" \
    "  Run dir:   ${RUN_DIR}" \
    "  Logs:      ${LOG_DIR}/" \
    "========================================"