# gensearcher-firered / scripts / entrypoint_body.sh
# JSCPPProgrammer's picture
# Fix HF exec entrypoint: generate entrypoint.sh in Docker; body in entrypoint_body.sh
# c994fd2 verified
# Main startup logic (run as: bash /app/scripts/entrypoint_body.sh).
# entrypoint.sh itself is generated in the Dockerfile so that HF can exec it
# without CRLF/BOM issues; everything of substance lives in this file.

# Strict mode: abort on errors, on unset variables and on failed pipeline stages.
set -o errexit -o nounset -o pipefail

# All relative paths below (services.*, app.py) are resolved from /app.
cd /app

# Same-container vLLM: PyTorch may call getpass.getuser() before USER is set in
# some runtimes, so give USER/LOGNAME a value when the runtime provides none.
: "${USER:=huggingface}"
: "${LOGNAME:=${USER}}"

# Compiler caches default to /tmp unless the caller already chose a location.
: "${TORCHINDUCTOR_CACHE_DIR:=/tmp/torch_inductor_cache}"
: "${TRITON_CACHE_DIR:=/tmp/triton_cache}"

# Put the vendored rllm checkout ahead of whatever PYTHONPATH already holds.
# NOTE(review): when PYTHONPATH is empty or unset the resulting value ends in a
# trailing ':' — presumably harmless here; confirm the empty entry is wanted.
PYTHONPATH="/app/vendor/rllm:${PYTHONPATH:-}"

export USER LOGNAME TORCHINDUCTOR_CACHE_DIR TRITON_CACHE_DIR PYTHONPATH

# Optional: load Space secrets copied to this path. allexport is switched on
# only while the file is read, so every assignment in it becomes an exported
# environment variable.
if [[ -f /app/.env.gen_image ]]; then
  set -o allexport
  # shellcheck source=/dev/null
  . /app/.env.gen_image
  set +o allexport
fi
# Configuration hints, printed only when this container is NOT asked to start
# its own GenSearcher vLLM (START_VLLM_GENSEARCHER is anything other than "1").
# They are informational: nothing in this section aborts startup.
if [[ "${START_VLLM_GENSEARCHER:-0}" != "1" ]]; then
  if [[ -z "${OPENAI_BASE_URL:-}" ]]; then
    # No endpoint configured at all: explain how to enable the in-Space server.
    printf '%s\n' \
      "[entrypoint] OPENAI_BASE_URL is unset. For GenSearcher **inside this Space only**, set Space variable" \
      "[entrypoint] START_VLLM_GENSEARCHER=1 (entrypoint will start vLLM here and set OPENAI_BASE_URL to loopback)."
  elif [[ "${OPENAI_BASE_URL}" == *127.0.0.1* || "${OPENAI_BASE_URL}" == *localhost* ]]; then
    # Endpoint points at this container, but this script will not start a
    # server for it.
    printf '%s\n' \
      "[entrypoint] WARNING: OPENAI_BASE_URL points to loopback but START_VLLM_GENSEARCHER is not 1." \
      "[entrypoint] The GenSearcher agent will get 'Connection error' unless a server listens here," \
      "[entrypoint] or you set OPENAI_BASE_URL to an external OpenAI-compatible URL (ending in /v1)."
  fi
fi
#######################################
# Poll an HTTP endpoint until it answers with a success status.
#
# Each probe is a single curl request; a probe counts as failed when curl
# exits non-zero (connection refused, HTTP error status because of -f, or a
# probe that exceeds its time limit).
#
# Arguments:
#   $1 - URL to probe
#   $2 - human-readable service name used in the log lines
#   $3 - maximum number of failed probes before giving up (default: 90)
# Outputs:
#   progress lines on stdout; the timeout message on stderr
# Returns:
#   0 once the endpoint responds. On timeout it calls `exit 1`, which ends the
#   calling shell (not just this function).
#######################################
wait_http() {
  local url=$1
  local name=$2
  local max_attempts=${3:-90}
  local i=0

  echo "[entrypoint] Waiting for ${name} (${url})..."
  # --connect-timeout / --max-time bound every single probe. Without them a
  # peer that accepts the connection but never replies would block curl
  # indefinitely and the attempt budget below would never be consulted.
  until curl -sf --connect-timeout 5 --max-time 10 "$url" >/dev/null 2>&1; do
    i=$((i + 1))
    if [[ $i -ge $max_attempts ]]; then
      # Fatal diagnostic: goes to stderr so it is not mixed into stdout.
      echo "[entrypoint] Timeout waiting for ${name}" >&2
      exit 1
    fi
    sleep 2
  done
  echo "[entrypoint] ${name} is up."
}
# Defaults: only FireRed + Gradio run in-container. Point OPENAI_BASE_URL /
# BROWSE_SUMMARY_BASE_URL at your vLLM (or other OpenAI-compatible) endpoints
# via Space secrets.
#
# --- Optional local vLLM: GenSearcher-8B (OpenAI-compatible) ---
# Runs only when START_VLLM_GENSEARCHER=1: launch `vllm serve` in the
# background on port 8002, block until its /v1/models route answers, then
# default OPENAI_BASE_URL to that local server (an already-set value is kept).
if [[ "${START_VLLM_GENSEARCHER:-0}" == "1" ]]; then
  gensearcher_api="http://127.0.0.1:8002/v1"

  # Full argv as an array so each option/value pair stays one unit.
  gensearcher_cmd=(
    vllm serve "${GENSEARCHER_MODEL_ID:-GenSearcher/Gen-Searcher-8B}"
    --host 0.0.0.0
    --port 8002
    --tensor-parallel-size "${GENSEARCHER_TP:-1}"
    --gpu-memory-utilization "${VLLM_GPU_MEMORY_UTIL:-0.85}"
    --served-model-name "${GEN_EVAL_MODEL:-Gen-Searcher-8B}"
    --max-model-len "${GENSEARCHER_MAX_MODEL_LEN:-65536}"
    --no-enable-prefix-caching
  )

  # GPU selection applies to this one process only (assignment prefix).
  CUDA_VISIBLE_DEVICES="${GENSEARCHER_CUDA_VISIBLE_DEVICES:-0}" \
    "${gensearcher_cmd[@]}" &

  wait_http "${gensearcher_api}/models" "GenSearcher vLLM"

  : "${OPENAI_BASE_URL:=${gensearcher_api}}"
  export OPENAI_BASE_URL
fi
# --- Optional local vLLM: browse summarization (Qwen3-VL) ---
# Runs only when START_VLLM_BROWSE=1: force BROWSE_GENERATE_ENGINE to "vllm",
# launch a second `vllm serve` in the background on port 8003, block until its
# /v1/models route answers, then default BROWSE_SUMMARY_BASE_URL to that local
# server (an already-set value is kept).
if [[ "${START_VLLM_BROWSE:-0}" == "1" ]]; then
  export BROWSE_GENERATE_ENGINE=vllm
  browse_api="http://127.0.0.1:8003/v1"

  # Full argv as an array so each option/value pair stays one unit.
  browse_cmd=(
    vllm serve "${BROWSE_MODEL_ID:-Qwen/Qwen3-VL-30B-A3B-Instruct}"
    --host 0.0.0.0
    --port 8003
    --tensor-parallel-size "${BROWSE_TP:-1}"
    --gpu-memory-utilization "${VLLM_GPU_MEMORY_UTIL:-0.85}"
    --served-model-name "${BROWSE_SUMMARY_MODEL:-Qwen3-VL-30B-A3B-Instruct}"
    --max-model-len "${BROWSE_MAX_MODEL_LEN:-65536}"
    --mm-processor-cache-gb 0
    --no-enable-prefix-caching
  )

  # GPU selection applies to this one process only (assignment prefix).
  CUDA_VISIBLE_DEVICES="${BROWSE_CUDA_VISIBLE_DEVICES:-1}" \
    "${browse_cmd[@]}" &

  wait_http "${browse_api}/models" "Browse-summary vLLM"

  : "${BROWSE_SUMMARY_BASE_URL:=${browse_api}}"
  export BROWSE_SUMMARY_BASE_URL
fi
# --- FireRed adapter (GenSearcher /generate contract) ---
# On by default (START_FIRERED_API defaults to 1). When enabled, uvicorn serves
# services.firered_generate:app in the background on port 8765; startup blocks
# until /health answers (up to 120 probes), then QWEN_EDIT_APP_URL defaults to
# that local adapter unless it was already set.
if [[ "${START_FIRERED_API:-1}" != "1" ]]; then
  printf '%s\n' "[entrypoint] START_FIRERED_API=0 — use external QWEN_EDIT_APP_URL for generation."
else
  firered_api="http://127.0.0.1:8765"
  firered_cmd=(
    python -m uvicorn services.firered_generate:app
    --host 0.0.0.0
    --port 8765
  )

  # GPU selection applies to this one process only (assignment prefix).
  CUDA_VISIBLE_DEVICES="${FIRERED_CUDA_VISIBLE_DEVICES:-0}" \
    "${firered_cmd[@]}" &

  wait_http "${firered_api}/health" "FireRed API" 120

  : "${QWEN_EDIT_APP_URL:=${firered_api}}"
  export QWEN_EDIT_APP_URL
fi

# Replace this shell with the main application; the servers started above keep
# running in the background.
exec python app.py