#!/usr/bin/env bash
# Launch the official llama.cpp server + the agent app (Docker GPU Space).
# llama-server downloads the GGUF from HF on first run and serves it on :8080;
# the app calls it via INFERENCE_BASE_URL=http://127.0.0.1:8080/v1.
#
# Environment read by this script:
#   USE_STUB_EXTRACTOR          "1" -> UI-only mode, no llama-server at all
#   MODEL_HF_REPO               HF repo holding the model (required unless stub)
#   MODEL_FILE                  explicit GGUF filename inside MODEL_HF_REPO
#   MODEL_QUANT                 quant tag for the -hf shorthand (default Q4_K_M)
#   MMPROJ_FILE / MMPROJ_REPO   optional vision projector (repo defaults to
#                               MODEL_HF_REPO)
#   PLANNER_HF_REPO / PLANNER_FILE / PLANNER_PORT / PLANNER_NGL / PLANNER_CTX
#                               optional second llama-server for the planner
#   INFERENCE_BASE_URL          only echoed here; consumed by the app
set -u

#######################################
# Download one file from a Hugging Face repo and print its local path.
# The repo and filename are handed to Python through argv rather than being
# spliced into the source text, so quotes/backslashes in the values cannot
# break (or inject into) the Python snippet.
# Arguments: $1 - repo id, $2 - filename inside the repo
# Outputs:   local path of the downloaded file on stdout
# Returns:   exit status of the python3 process (non-zero on failure)
#######################################
hf_download() {
  local repo=$1
  local file=$2
  python3 -c 'import sys
from huggingface_hub import hf_hub_download
print(hf_hub_download(sys.argv[1], sys.argv[2]))' "$repo" "$file"
}

# UI-only / preview mode: in stub mode there's no model, so skip llama-server
# entirely (otherwise it would download the ~20GB GGUF and fail on a CPU box).
# Lets the Space run the full UI for free on cpu-basic. See PLAN / docs.
if [[ "${USE_STUB_EXTRACTOR:-0}" == "1" ]]; then
  echo "[start] UI-only (USE_STUB_EXTRACTOR=1) — skipping llama-server"
  exec python3 app.py
fi

# Every non-stub path needs the model repo; fail with a clear message instead
# of a bare "unbound variable" from `set -u` further down.
: "${MODEL_HF_REPO:?MODEL_HF_REPO must be set (or set USE_STUB_EXTRACTOR=1)}"

LS="$(command -v llama-server || echo /app/llama-server)"
# The official binary's sibling .so (libllama-server-impl.so) lives next to it in
# /app; we run from /srv, so add its dir to the loader path. The previous value
# is appended only when non-empty: a trailing ':' would add an empty entry,
# which the dynamic loader treats as the current directory.
ls_dir="$(dirname "$LS")"
export LD_LIBRARY_PATH="${ls_dir}:/app${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
echo "[start] using llama-server at: $LS (LD_LIBRARY_PATH=$LD_LIBRARY_PATH)"

# Arguments for the main llama-server, built as an array so that paths with
# whitespace or glob characters survive intact.
server_args=()
model_ok=1

# Model selection: MODEL_FILE (explicit filename in MODEL_HF_REPO) is preferred —
# the repo holds multiple Q4_K_M GGUFs (31B + E4B edge), so the `-hf repo:quant`
# shorthand is ambiguous there. Falls back to -hf REPO:QUANT when MODEL_FILE unset.
if [[ -n "${MODEL_FILE:-}" ]]; then
  echo "[start] model: ${MODEL_HF_REPO}/${MODEL_FILE} (explicit file; downloads on first run)"
  if MODEL_PATH="$(hf_download "$MODEL_HF_REPO" "$MODEL_FILE")" && [[ -n "$MODEL_PATH" ]]; then
    server_args+=(-m "$MODEL_PATH")
  else
    # Without a model file there is nothing to serve; skip llama-server but
    # still start the app below, as before.
    echo "[start] ERROR: model download failed for ${MODEL_HF_REPO}/${MODEL_FILE}"
    model_ok=0
  fi
else
  echo "[start] model: ${MODEL_HF_REPO}:${MODEL_QUANT:-Q4_K_M} (downloads on first run)"
  server_args+=(-hf "${MODEL_HF_REPO}:${MODEL_QUANT:-Q4_K_M}")
fi

# -ngl 999 offloads all layers to the GPU; --jinja enables the chat/tool template.
server_args+=(--host 127.0.0.1 --port 8080 -ngl 999 -c 8192 --jinja)

# Vision: download the mmproj projector and pass --mmproj so llama-server accepts
# image_url inputs (screenshots/flyers). MMPROJ_REPO lets the projector come from a
# different repo than the LLM (the E4B edge model uses the base E4B's projector,
# not the 31B mmproj stored alongside it). Falls back to text-only if unavailable.
if [[ -n "${MMPROJ_FILE:-}" ]]; then
  MMPROJ_REPO="${MMPROJ_REPO:-$MODEL_HF_REPO}"
  echo "[start] fetching mmproj ${MMPROJ_REPO}/${MMPROJ_FILE} for vision..."
  # Deliberately best-effort: errors are silenced and an empty path simply
  # means "run text-only".
  MMPROJ_PATH="$(hf_download "$MMPROJ_REPO" "$MMPROJ_FILE" 2>/dev/null || true)"
  if [[ -n "$MMPROJ_PATH" ]]; then
    server_args+=(--mmproj "$MMPROJ_PATH")
    echo "[start] mmproj ready: $MMPROJ_PATH"
  else
    echo "[start] mmproj download failed -> text-only"
  fi
fi

# Start the main server in the background; LLAMA_PID stays empty when the
# model could not be fetched so the health wait below is skipped.
LLAMA_PID=""
if [[ "$model_ok" == "1" ]]; then
  "$LS" "${server_args[@]}" &
  LLAMA_PID=$!
fi

# Optional second llama-server: the Agent tab's MiniCPM planner. OFF unless
# PLANNER_HF_REPO+PLANNER_FILE are set. VRAM note: E4B Q4 (~5GB) + MiniCPM-8B
# Q4 (~5GB) + KV is tight on a 16GB T4 — tune PLANNER_NGL (default 999; lower
# it for partial offload, planner outputs are short) or use the 1B variant
# (openbmb/MiniCPM5-1B-GGUF / MiniCPM5-1B-Q4_K_M.gguf).
# PLANNER_CTX (default 8192, matching the main model): a multi-step agent run
# accumulates the tool schemas + task + thread + each step's observations, so
# 4096 overflows on real threads ("request (4142 tokens) exceeds context").
if [[ -n "${PLANNER_HF_REPO:-}" && -n "${PLANNER_FILE:-}" ]]; then
  echo "[start] planner: ${PLANNER_HF_REPO}/${PLANNER_FILE} on :${PLANNER_PORT:-8081}"
  if PLANNER_PATH="$(hf_download "$PLANNER_HF_REPO" "$PLANNER_FILE")" && [[ -n "$PLANNER_PATH" ]]; then
    "$LS" -m "$PLANNER_PATH" \
      --host 127.0.0.1 --port "${PLANNER_PORT:-8081}" \
      -ngl "${PLANNER_NGL:-999}" -c "${PLANNER_CTX:-8192}" --jinja &
    echo "[start] planner launching (PLANNER_BASE_URL should be http://127.0.0.1:${PLANNER_PORT:-8081}/v1)"
  else
    # Do not start a server with an empty -m argument.
    echo "[start] ERROR: planner download failed for ${PLANNER_HF_REPO}/${PLANNER_FILE} -> planner disabled"
  fi
fi

# Poll the main server's /health endpoint: up to 900 attempts, 2s apart.
# The app is launched afterwards in every case (ready, crashed, or timed out).
if [[ -n "$LLAMA_PID" ]]; then
  echo "[start] waiting for llama-server health (model download can take minutes)..."
  for ((i = 1; i <= 900; i++)); do
    # kill -0 sends no signal; it only tests that the process still exists.
    if ! kill -0 "$LLAMA_PID" 2>/dev/null; then
      echo "[start] ERROR: llama-server exited early"
      break
    fi
    if curl -sf http://127.0.0.1:8080/health >/dev/null 2>&1; then
      echo "[start] llama-server ready after ~$((i * 2))s"
      break
    fi
    sleep 2
  done
  # i only runs past 900 when the loop finished without a break.
  if ((i > 900)); then
    echo "[start] WARNING: llama-server not healthy after ~1800s; launching app anyway"
  fi
fi

# INFERENCE_BASE_URL is only logged here; default it so `set -u` cannot abort
# the script right before the app starts.
echo "[start] launching app (UI + /agent) -> INFERENCE_BASE_URL=${INFERENCE_BASE_URL:-<unset>}"
exec python3 app.py