#!/bin/bash
# Smoke test: run Frontier-CS on a few problems to verify the full pipeline.
#
# Eval service: if nothing answers on ${EVAL_URL}/api/v1/status, this script
# starts eval_agent/ev2_service_standalone.py in the background and stops it
# again when the script exits. If a service is already answering, it is reused
# and left running afterwards.
#
# Commands listed in the original header for starting the service manually:
#   bash scripts/dev/start_eval_server.sh
#   OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 uv run eval_agent/ev2_service_standalone.py --host "0.0.0.0" --port 8755

set -euo pipefail

# All relative paths below are resolved from two directories above this script.
cd "$(dirname "$0")/../.."

PYTHON=".venv/bin/python"
GENS=10       # value passed as --num-generations for each problem
PARALLEL=2    # value passed as --max-parallel-jobs

# ============================================================================
# Start eval service in the background (if not already running)
# ============================================================================
EVAL_PORT=8755
EVAL_URL="http://localhost:${EVAL_PORT}"
EVAL_READY_ATTEMPTS=30   # number of 1-second polls before giving up

# Succeeds when the status endpoint can be fetched; all curl output is dropped.
eval_service_up() {
  curl -s "${EVAL_URL}/api/v1/status" > /dev/null 2>&1
}

# Stops the eval service started by this script. Best effort: the process may
# already be gone, so a failing kill is deliberately ignored.
stop_eval_service() {
  echo 'Stopping eval service...'
  kill "${EVAL_PID}" 2>/dev/null || true
}

if eval_service_up; then
  echo "Eval service already running at ${EVAL_URL}"
else
  echo "Starting eval service on port ${EVAL_PORT}..."
  OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \
    "${PYTHON}" eval_agent/ev2_service_standalone.py \
    --host "0.0.0.0" --port "${EVAL_PORT}" &
  EVAL_PID=$!

  # Install the cleanup handler right away so that an interruption or failure
  # during the readiness wait below does not leave the service orphaned.
  trap stop_eval_service EXIT

  # Wait for service to become ready
  for ((attempt = 1; attempt <= EVAL_READY_ATTEMPTS; attempt++)); do
    if eval_service_up; then
      echo "Eval service ready (pid=${EVAL_PID})"
      break
    fi
    sleep 1
  done

  # Final probe: covers both "never came up" and "came up during the last sleep".
  if ! eval_service_up; then
    echo "ERROR: Eval service failed to start" >&2
    exit 1   # the EXIT trap kills the background process
  fi
fi

echo "========================================"
echo "Frontier-CS Smoke Test"
echo "========================================"
echo ""

# Run the experiment entry point once per problem id. Under `set -e` the first
# failing run aborts the whole smoke test.
for problem_id in 0 1; do
  echo "----------------------------------------"
  echo "Problem ${problem_id} (${GENS} generations)"
  echo "----------------------------------------"

  "${PYTHON}" tasks/frontier_cs_entry/run_experiment.py \
    --experiment-name "smoke_p${problem_id}" \
    --problem-id "${problem_id}" \
    --seed-model gemini3pro \
    --num-generations "${GENS}" \
    --max-parallel-jobs "${PARALLEL}" \
    --use-eval-service \
    --eval-service-url "${EVAL_URL}" \
    --eval-trigger-mode periodic \
    --eval-trigger-interval 5

  echo ""
done

echo "========================================"
echo "Smoke test complete"
echo "========================================"