File size: 2,463 Bytes
3f6526a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/bin/bash
# Smoke test: run Frontier-CS on a few problems to verify the full pipeline.
#
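# Prerequisites: none for the eval service itself. This script starts it on port 8755
# when it is not already reachable. To run the service yourself beforehand, use either:
#   bash scripts/dev/start_eval_server.sh
#   OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 uv run eval_agent/ev2_service_standalone.py --host "0.0.0.0" --port 8755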

# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Work from the directory two levels above this script so that the relative
# paths used below (.venv/, eval_agent/, tasks/) resolve.
cd "$(dirname "$0")/../.."

# Interpreter used for every Python invocation in this script.
PYTHON='.venv/bin/python'
# Passed to run_experiment.py as --num-generations.
GENS=10
# Passed to run_experiment.py as --max-parallel-jobs.
PARALLEL=2

# ============================================================================
# Start eval service in the background (if not already running)
# ============================================================================
EVAL_PORT=8755
EVAL_URL="http://localhost:${EVAL_PORT}"

# Probe the eval service status endpoint.
# Globals:  EVAL_URL (read)
# Returns:  0 when the endpoint answers, non-zero otherwise.
# --max-time bounds each probe so an unresponsive listener cannot hang the script.
eval_service_is_up() {
    curl -s --max-time 10 "${EVAL_URL}/api/v1/status" > /dev/null 2>&1
}

if eval_service_is_up; then
    echo "Eval service already running at ${EVAL_URL}"
else
    echo "Starting eval service on port ${EVAL_PORT}..."
    OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \
        "${PYTHON}" eval_agent/ev2_service_standalone.py \
        --host "0.0.0.0" --port "${EVAL_PORT}" &
    EVAL_PID=$!

    # Register cleanup immediately so the background service is also stopped
    # when the script is interrupted or fails while waiting for readiness.
    # Single quotes defer expansion of EVAL_PID until the trap actually runs.
    trap 'echo "Stopping eval service..."; kill "${EVAL_PID}" 2>/dev/null || true' EXIT

    # Wait for service to become ready (up to 30 probes, one second apart)
    for _ in {1..30}; do
        if eval_service_is_up; then
            echo "Eval service ready (pid=${EVAL_PID})"
            break
        fi
        # Stop waiting as soon as the service process itself has exited;
        # further probes cannot succeed against a process we started and lost.
        if ! kill -0 "${EVAL_PID}" 2>/dev/null; then
            break
        fi
        sleep 1
    done

    if ! eval_service_is_up; then
        echo "ERROR: Eval service failed to start" >&2
        # The EXIT trap registered above terminates the background process.
        exit 1
    fi
fi

# Banner rules reused for the section headers printed below.
rule_heavy="========================================"
rule_light="----------------------------------------"

# Problem ids exercised by the smoke test.
problem_ids=(0 1)

printf '%s\n' "${rule_heavy}" "Frontier-CS Smoke Test" "${rule_heavy}" ""

# Run the experiment once per problem; under errexit the first failing run
# aborts the script.
for problem_id in "${problem_ids[@]}"; do
    printf '%s\n' \
        "${rule_light}" \
        "Problem ${problem_id} (${GENS} generations)" \
        "${rule_light}"

    # Arguments are collected in an array so each one stays a separate word.
    run_args=(
        --experiment-name "smoke_p${problem_id}"
        --problem-id "${problem_id}"
        --seed-model gemini3pro
        --num-generations "${GENS}"
        --max-parallel-jobs "${PARALLEL}"
        --use-eval-service
        --eval-service-url "${EVAL_URL}"
        --eval-trigger-mode periodic
        --eval-trigger-interval 5
    )
    "${PYTHON}" tasks/frontier_cs_entry/run_experiment.py "${run_args[@]}"
    printf '\n'
done

printf '%s\n' "${rule_heavy}" "Smoke test complete" "${rule_heavy}"