OffGridSchedula / scripts /start_space.sh
ParetoOptimal's picture
Initial Commit
0366d65
Raw
History Blame Contribute Delete
4.36 kB
#!/usr/bin/env bash
# Launch the official llama.cpp server + the agent app (Docker GPU Space).
# llama-server downloads the GGUF from HF on first run and serves it on :8080;
# the app calls it via INFERENCE_BASE_URL=http://127.0.0.1:8080/v1.
# Treat any expansion of an unset variable as a fatal error.
set -u

# Preview / UI-only mode. With the stub extractor there is no model to serve,
# so llama-server is never started (it would otherwise pull the ~20GB GGUF and
# then fail on a CPU box). This is what lets the Space run the full UI for free
# on cpu-basic. See PLAN / docs.
case "${USE_STUB_EXTRACTOR:-0}" in
  1)
    echo "[start] UI-only (USE_STUB_EXTRACTOR=1) — skipping llama-server"
    # exec replaces this shell with the app; nothing below runs in this mode.
    exec python3 app.py
    ;;
esac
# Locate the llama-server binary: use the one on PATH if there is one,
# otherwise fall back to /app/llama-server.
LS="$(command -v llama-server || echo /app/llama-server)"

# The official binary's sibling .so (libllama-server-impl.so) lives next to it in
# /app; we run from /srv, so add its dir to the loader path.
# The inherited LD_LIBRARY_PATH is appended only when it is non-empty: an
# unconditional trailing ':' would leave an empty entry, which the dynamic
# loader interprets as "the current working directory".
LS_DIR="$(dirname "$LS")"
LD_LIBRARY_PATH="${LS_DIR}:/app${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export LD_LIBRARY_PATH
echo "[start] using llama-server at: $LS (LD_LIBRARY_PATH=$LD_LIBRARY_PATH)"
# Download file $2 from Hugging Face repo $1 (via huggingface_hub) and print
# its local path on stdout. Repo and filename are handed to Python as argv
# instead of being spliced into the program text, so quotes or other special
# characters in them cannot break (or inject code into) the snippet.
hf_fetch() {
  python3 -c 'import sys
from huggingface_hub import hf_hub_download
print(hf_hub_download(sys.argv[1], sys.argv[2]))' "$1" "$2"
}

# Model selection: MODEL_FILE (explicit filename in MODEL_HF_REPO) is preferred —
# the repo holds multiple Q4_K_M GGUFs (31B + E4B edge), so the `-hf repo:quant`
# shorthand is ambiguous there. Falls back to -hf REPO:QUANT when MODEL_FILE unset.
# All llama-server arguments are collected in an array so that paths containing
# spaces or glob characters reach the server as single, intact arguments.
if [ -n "${MODEL_FILE:-}" ]; then
  echo "[start] model: ${MODEL_HF_REPO}/${MODEL_FILE} (explicit file; downloads on first run)"
  MODEL_PATH="$(hf_fetch "$MODEL_HF_REPO" "$MODEL_FILE")" || MODEL_PATH=""
  if [ -z "$MODEL_PATH" ]; then
    # The server is still started below so LLAMA_PID is set; it will exit at
    # once on the empty model path instead of mis-parsing the next flag as -m's
    # value, and the failure is spelled out here.
    echo "[start] ERROR: could not download model ${MODEL_HF_REPO}/${MODEL_FILE}"
  fi
  server_args=(-m "$MODEL_PATH")
else
  echo "[start] model: ${MODEL_HF_REPO}:${MODEL_QUANT:-Q4_K_M} (downloads on first run)"
  server_args=(-hf "${MODEL_HF_REPO}:${MODEL_QUANT:-Q4_K_M}")
fi

# -ngl 999 offloads all layers to the GPU; --jinja enables the chat/tool template.
server_args+=(--host 127.0.0.1 --port 8080 -ngl 999 -c 8192 --jinja)

# Vision: download the mmproj projector and pass --mmproj so llama-server accepts
# image_url inputs (screenshots/flyers). MMPROJ_REPO lets the projector come from a
# different repo than the LLM (the E4B edge model uses the base E4B's projector,
# not the 31B mmproj stored alongside it). Falls back to text-only if unavailable.
if [ -n "${MMPROJ_FILE:-}" ]; then
  MMPROJ_REPO="${MMPROJ_REPO:-$MODEL_HF_REPO}"
  echo "[start] fetching mmproj ${MMPROJ_REPO}/${MMPROJ_FILE} for vision..."
  # Deliberately best-effort: errors are silenced and an empty result simply
  # means the server starts without vision support.
  MMPROJ_PATH="$(hf_fetch "$MMPROJ_REPO" "$MMPROJ_FILE" 2>/dev/null || true)"
  if [ -n "$MMPROJ_PATH" ]; then
    server_args+=(--mmproj "$MMPROJ_PATH")
    echo "[start] mmproj ready: $MMPROJ_PATH"
  else
    echo "[start] mmproj download failed -> text-only"
  fi
fi

# Start the main llama-server in the background and remember its PID.
"$LS" "${server_args[@]}" &
LLAMA_PID=$!
# Optional second llama-server: the Agent tab's MiniCPM planner. OFF unless
# PLANNER_HF_REPO+PLANNER_FILE are set. VRAM note: E4B Q4 (~5GB) + MiniCPM-8B
# Q4 (~5GB) + KV is tight on a 16GB T4 — tune PLANNER_NGL (default 999; lower
# it for partial offload, planner outputs are short) or use the 1B variant
# (openbmb/MiniCPM5-1B-GGUF / MiniCPM5-1B-Q4_K_M.gguf).
# PLANNER_CTX (default 8192, matching the main model): a multi-step agent run
# accumulates the tool schemas + task + thread + each step's observations, so
# 4096 overflows on real threads ("request (4142 tokens) exceeds context").

# Download the planner GGUF and start a llama-server for it in the background.
# Reads PLANNER_HF_REPO, PLANNER_FILE, PLANNER_PORT, PLANNER_NGL, PLANNER_CTX
# and LS; sets PLANNER_PATH. Returns 1 (without starting anything) when the
# download fails, 0 once the server has been spawned.
launch_planner() {
  local port="${PLANNER_PORT:-8081}"
  echo "[start] planner: ${PLANNER_HF_REPO}/${PLANNER_FILE} on :${port}"
  # Repo and filename go in as argv rather than being interpolated into the
  # Python source, so special characters in them cannot alter the snippet.
  PLANNER_PATH="$(python3 -c 'import sys
from huggingface_hub import hf_hub_download
print(hf_hub_download(sys.argv[1], sys.argv[2]))' "$PLANNER_HF_REPO" "$PLANNER_FILE")" || PLANNER_PATH=""
  if [ -z "$PLANNER_PATH" ]; then
    echo "[start] planner download failed -> planner not started"
    return 1
  fi
  "$LS" -m "$PLANNER_PATH" \
    --host 127.0.0.1 --port "$port" \
    -ngl "${PLANNER_NGL:-999}" -c "${PLANNER_CTX:-8192}" --jinja &
  echo "[start] planner launching (PLANNER_BASE_URL should be http://127.0.0.1:${port}/v1)"
}

if [ -n "${PLANNER_HF_REPO:-}" ] && [ -n "${PLANNER_FILE:-}" ]; then
  # The planner is optional: a failure here must not stop the rest of startup.
  launch_planner || true
fi
# Poll llama-server's /health endpoint on 127.0.0.1:8080 until it answers,
# the server process disappears, or the poll budget is used up.
#   $1 - PID of the llama-server process to watch
#   $2 - maximum number of polls (default 900)
#   $3 - seconds to sleep between polls (default 2)
# Prints a status line for every outcome (including timeout).
# Returns 0 when healthy, 1 if the process exited, 2 on timeout.
wait_for_llama() {
  local pid="$1" max_tries="${2:-900}" interval="${3:-2}"
  # SECONDS gives real elapsed time, including time spent inside curl.
  local started=$SECONDS
  local try
  for ((try = 1; try <= max_tries; try++)); do
    if ! kill -0 "$pid" 2>/dev/null; then
      echo "[start] ERROR: llama-server exited early"
      return 1
    fi
    if curl -sf http://127.0.0.1:8080/health >/dev/null 2>&1; then
      echo "[start] llama-server ready after ~$((SECONDS - started))s"
      return 0
    fi
    sleep "$interval"
  done
  echo "[start] WARNING: llama-server not healthy after ~$((SECONDS - started))s; continuing anyway"
  return 2
}

echo "[start] waiting for llama-server health (model download can take minutes)..."
# Startup continues whatever the outcome, so the result is informational only.
wait_for_llama "$LLAMA_PID" || true
# INFERENCE_BASE_URL is only logged here. The default expansion matters because
# the script runs under `set -u`: a bare $INFERENCE_BASE_URL would abort the
# script right before the app starts whenever the variable is not set.
echo "[start] launching app (UI + /agent) -> INFERENCE_BASE_URL=${INFERENCE_BASE_URL:-<unset>}"
# exec replaces this shell with the app process.
exec python3 app.py