File size: 3,977 Bytes
3f6526a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/bin/bash
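# Batch run: Frontier-CS algorithmic problems with eval service.
#
# Usage: $0 [PID_START] [PID_END]
#   Inclusive range of problem IDs to run; defaults to 0 and 49.
#
# Eval service: if it is not already reachable, this script starts it in the
# background and stops it again on exit. To run it yourself beforehand:
#   OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \
#     .venv/bin/python eval_agent/ev2_service_standalone.py --host "0.0.0.0" --port 8755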

# Strict mode: stop on command failure, on unset variables, and on a failing
# stage anywhere in a pipeline.
set -o errexit -o nounset -o pipefail

# Run from the directory two levels above this script so that the relative
# paths used below resolve.
cd "$(dirname "$0")/../.."

# Interpreter used for every Python entry point launched by this script.
PYTHON='.venv/bin/python'

# ----------------------------------------------------------------------------
# Configuration
# ----------------------------------------------------------------------------

# Experiment settings (forwarded to run_experiment.py).
GENS=50
PARALLEL=4
SEED_MODEL='gemini3pro'

# Eval service endpoint.
EVAL_PORT=8755
EVAL_URL="http://localhost:${EVAL_PORT}"

# Eval trigger policy; the interval is large enough that it effectively never
# triggers the agent.
EVAL_TRIGGER_MODE='periodic'
EVAL_TRIGGER_INTERVAL=1000

# Inclusive problem-ID range, taken from the first two positional arguments.
PID_START=${1:-0}
PID_END=${2:-49}

# ============================================================================
# Start eval service in the background (if not already running)
# ============================================================================

# Succeeds when the eval service answers its status endpoint.
# --max-time bounds each probe so a listener that accepts the connection but
# never responds cannot stall the script.
eval_service_up() {
    curl -s --max-time 5 "${EVAL_URL}/api/v1/status" > /dev/null 2>&1
}

# EXIT handler: stops the eval service process launched by this script.
# The kill is best-effort because the process may already be gone.
stop_eval_service() {
    echo 'Stopping eval service...'
    kill "${EVAL_PID}" 2>/dev/null || true
}

if eval_service_up; then
    # A service we did not start is left running when this script exits.
    echo "Eval service already running at ${EVAL_URL}"
else
    echo "Starting eval service on port ${EVAL_PORT}..."
    OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \
        "${PYTHON}" eval_agent/ev2_service_standalone.py \
        --host "0.0.0.0" --port "${EVAL_PORT}" &
    EVAL_PID=$!

    # Register cleanup right after launch so that every later exit path,
    # including a failed readiness wait, stops the background service.
    trap stop_eval_service EXIT

    # Poll for readiness: up to 30 attempts, one second apart.
    for ((attempt = 1; attempt <= 30; attempt++)); do
        if eval_service_up; then
            echo "Eval service ready (pid=${EVAL_PID})"
            break
        fi
        # Stop waiting as soon as the launched process has exited; it cannot
        # become ready, so sleeping through the remaining attempts is wasted.
        if ! kill -0 "${EVAL_PID}" 2>/dev/null; then
            break
        fi
        sleep 1
    done

    # Final probe decides success, whichever way the loop ended.
    if ! eval_service_up; then
        echo "ERROR: Eval service failed to start" >&2
        exit 1   # the EXIT trap kills any leftover service process
    fi
fi

# ============================================================================
# Collect valid problem IDs in range
# ============================================================================
PROBLEMS_DIR="tasks/Frontier-CS/algorithmic/problems"
SOLUTIONS_DIR="tasks/Frontier-CS/algorithmic/solutions"
PIDS=()

#######################################
# Fill PIDS with the numeric entries of a problems directory that fall inside
# an inclusive range and have a matching solutions directory.
# Globals:   PIDS (written; reset on every call)
# Arguments: $1 - problems directory
#            $2 - solutions directory
#            $3 - first problem ID (integer, inclusive)
#            $4 - last problem ID (integer, inclusive)
# Outputs:   one "SKIP" line on stdout per in-range problem without
#            solutions; diagnostics on stderr
# Returns:   0 on success (PIDS may be empty), 2 if a bound is not an integer
#######################################
collect_problem_ids() {
    local problems_dir=$1 solutions_dir=$2 first=$3 last=$4
    local bound entry pid

    PIDS=()

    # Reject malformed bounds up front. Otherwise every comparison below
    # would fail and the batch would quietly run zero problems.
    for bound in "${first}" "${last}"; do
        if ! [[ "${bound}" =~ ^-?[0-9]+$ ]]; then
            echo "ERROR: problem range must be integers (got '${first}' .. '${last}')" >&2
            return 2
        fi
    done

    if [[ ! -d "${problems_dir}" ]]; then
        echo "WARNING: problems directory not found: ${problems_dir}" >&2
        return 0
    fi

    while IFS= read -r pid; do
        # `[ ... ]` compares in base 10, so IDs with leading zeros are safe.
        if [ "${pid}" -ge "${first}" ] && [ "${pid}" -le "${last}" ]; then
            # Ensure solution exists for this problem
            if [[ -d "${solutions_dir}/${pid}" ]]; then
                PIDS+=("${pid}")
            else
                echo "SKIP problem ${pid}: no solutions available"
            fi
        fi
    done < <(
        # Glob instead of parsing `ls`: names are never word-split, and only
        # all-digit names are passed on. An unmatched glob leaves a literal
        # "*", which the digit filter drops. `sort -n` gives ascending order.
        for entry in "${problems_dir}"/*; do
            pid=${entry##*/}
            if [[ "${pid}" =~ ^[0-9]+$ ]]; then
                printf '%s\n' "${pid}"
            fi
        done | sort -n
    )
}

# Under the script's `set -e`, a non-zero return (invalid range) aborts here.
collect_problem_ids "${PROBLEMS_DIR}" "${SOLUTIONS_DIR}" "${PID_START}" "${PID_END}"

# Print the run banner: selected problems, their count, and the main settings,
# followed by one blank line.
printf '%s\n' \
    "========================================" \
    "Frontier-CS Batch Run" \
    "========================================" \
    "  Problems:    ${PIDS[*]}" \
    "  Total:       ${#PIDS[@]}" \
    "  Generations: ${GENS}" \
    "  Parallel:    ${PARALLEL}" \
    "  Seed model:  ${SEED_MODEL}" \
    "  Eval agent:  disabled (interval=${EVAL_TRIGGER_INTERVAL})" \
    "========================================" \
    ""

DONE=0
FAILED=0

#######################################
# Run the experiment for every ID in PIDS, continuing past failures.
# Globals:   PIDS, PYTHON, GENS, SEED_MODEL, PARALLEL, EVAL_URL,
#            EVAL_TRIGGER_MODE, EVAL_TRIGGER_INTERVAL (read);
#            DONE, FAILED (reset, then incremented per problem)
# Arguments: none
# Outputs:   a "[position/total]" header and an OK/FAILED line per problem
# Returns:   0 always; per-problem failures are only counted in FAILED
#######################################
run_batch() {
    local total=${#PIDS[@]}
    local position=0
    local pid

    DONE=0
    FAILED=0

    for pid in "${PIDS[@]}"; do
        # The header shows the 1-based position in the batch. The success
        # count would start at 0 and stop advancing after any failure.
        position=$((position + 1))
        echo "----------------------------------------"
        echo "[${position}/${total}] Problem ${pid}"
        echo "----------------------------------------"

        # Running inside `if` keeps `set -e` from aborting the whole batch
        # when a single experiment fails.
        if "${PYTHON}" tasks/frontier_cs_entry/run_experiment.py \
            --experiment-name "batch_g${GENS}" \
            --problem-id "${pid}" \
            --seed-model "${SEED_MODEL}" \
            --num-generations "${GENS}" \
            --max-parallel-jobs "${PARALLEL}" \
            --use-eval-service \
            --eval-service-url "${EVAL_URL}" \
            --eval-trigger-mode "${EVAL_TRIGGER_MODE}" \
            --eval-trigger-interval "${EVAL_TRIGGER_INTERVAL}" \
            --verbose; then
            DONE=$((DONE + 1))
            echo "OK problem ${pid}"
        else
            FAILED=$((FAILED + 1))
            echo "FAILED problem ${pid}"
        fi
        echo ""
    done

    return 0
}

run_batch

# NOTE(review): the script still exits 0 when FAILED > 0, as before; confirm
# whether callers need a non-zero status for a partially failed batch.
echo "========================================"
echo "Batch complete: ${DONE} succeeded, ${FAILED} failed out of ${#PIDS[@]}"
echo "========================================"