Spaces:
Sleeping
Sleeping
# Entrypoint for the Docker GPU Space: brings up the official llama.cpp server
# and then the agent app. On first run llama-server fetches the GGUF from HF and
# serves it on :8080; the app talks to it through
# INFERENCE_BASE_URL=http://127.0.0.1:8080/v1.
set -u

# Preview / UI-only mode. With the stub extractor there is no model, so
# llama-server is skipped altogether (it would otherwise download the ~20GB GGUF
# and then fail on a CPU box). This lets the Space serve the full UI for free on
# cpu-basic. See PLAN / docs.
case "${USE_STUB_EXTRACTOR:-0}" in
  1)
    echo "[start] UI-only (USE_STUB_EXTRACTOR=1) — skipping llama-server"
    # exec replaces this shell with the app, so nothing below runs in stub mode.
    exec python3 app.py
    ;;
esac
# Resolve the llama-server binary: whatever is on PATH wins, otherwise fall back
# to /app/llama-server.
LS="$(command -v llama-server || echo /app/llama-server)"

# The official binary's sibling .so (libllama-server-impl.so) lives next to it in
# /app; we run from /srv, so add its dir to the loader path.
# The inherited value is appended only when it is non-empty (${VAR:+...}):
# an unconditional ":${LD_LIBRARY_PATH:-}" leaves a trailing ':' when nothing is
# inherited, and the dynamic loader treats that empty entry as the current
# working directory.
LS_DIR="$(dirname "$LS")"
LD_LIBRARY_PATH="${LS_DIR}:/app${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export LD_LIBRARY_PATH
echo "[start] using llama-server at: $LS (LD_LIBRARY_PATH=$LD_LIBRARY_PATH)"
# Print on stdout the local path of file $2 from Hugging Face repo $1 (the file
# is downloaded on first run). Repo and file name are handed to Python through
# argv instead of being spliced into the Python source, so quotes or backslashes
# in them cannot break (or inject code into) the snippet.
# Returns python3's exit status.
hf_download() {
  python3 -c 'import sys; from huggingface_hub import hf_hub_download; print(hf_hub_download(sys.argv[1], sys.argv[2]))' "$1" "$2"
}

# Assemble the main llama-server command line and start it in the background.
#   Reads:  LS, MODEL_HF_REPO, MODEL_FILE, MODEL_QUANT, MMPROJ_FILE, MMPROJ_REPO
#   Writes: LLAMA_PID (PID of the background server), MODEL_PATH, MMPROJ_PATH,
#           MMPROJ_REPO
# The argument list is accumulated in the function's own positional parameters
# (POSIX sh has no arrays), so a path containing spaces or glob characters
# reaches llama-server as one argument instead of being word-split.
start_llama_server() {
  # Model selection: MODEL_FILE (explicit filename in MODEL_HF_REPO) is preferred —
  # the repo holds multiple Q4_K_M GGUFs (31B + E4B edge), so the `-hf repo:quant`
  # shorthand is ambiguous there. Falls back to -hf REPO:QUANT when MODEL_FILE unset.
  if [ -n "${MODEL_FILE:-}" ]; then
    echo "[start] model: ${MODEL_HF_REPO}/${MODEL_FILE} (explicit file; downloads on first run)"
    MODEL_PATH="$(hf_download "${MODEL_HF_REPO}" "${MODEL_FILE}")" || MODEL_PATH=""
    if [ -z "$MODEL_PATH" ]; then
      # Say why up front. The server is still launched so the rest of the
      # startup sequence is unchanged; it is expected to fail on the empty path.
      echo "[start] ERROR: model download failed: ${MODEL_HF_REPO}/${MODEL_FILE}" >&2
    fi
    set -- -m "$MODEL_PATH"
  else
    echo "[start] model: ${MODEL_HF_REPO}:${MODEL_QUANT:-Q4_K_M} (downloads on first run)"
    set -- -hf "${MODEL_HF_REPO}:${MODEL_QUANT:-Q4_K_M}"
  fi

  # -ngl 999 offloads all layers to the GPU; --jinja enables the chat/tool template.
  set -- "$@" --host 127.0.0.1 --port 8080 -ngl 999 -c 8192 --jinja

  # Vision: download the mmproj projector and pass --mmproj so llama-server accepts
  # image_url inputs (screenshots/flyers). MMPROJ_REPO lets the projector come from a
  # different repo than the LLM (the E4B edge model uses the base E4B's projector,
  # not the 31B mmproj stored alongside it). Falls back to text-only if unavailable.
  if [ -n "${MMPROJ_FILE:-}" ]; then
    MMPROJ_REPO="${MMPROJ_REPO:-$MODEL_HF_REPO}"
    echo "[start] fetching mmproj ${MMPROJ_REPO}/${MMPROJ_FILE} for vision..."
    # Best-effort on purpose: stderr is discarded and a failure only disables vision.
    MMPROJ_PATH="$(hf_download "$MMPROJ_REPO" "$MMPROJ_FILE" 2>/dev/null)" || MMPROJ_PATH=""
    if [ -n "$MMPROJ_PATH" ]; then
      set -- "$@" --mmproj "$MMPROJ_PATH"
      echo "[start] mmproj ready: $MMPROJ_PATH"
    else
      echo "[start] mmproj download failed -> text-only"
    fi
  fi

  "$LS" "$@" &
  LLAMA_PID=$!
}

start_llama_server
# Optional second llama-server: the Agent tab's MiniCPM planner. OFF unless
# PLANNER_HF_REPO+PLANNER_FILE are set. VRAM note: E4B Q4 (~5GB) + MiniCPM-8B
# Q4 (~5GB) + KV is tight on a 16GB T4 — tune PLANNER_NGL (default 999; lower
# it for partial offload, planner outputs are short) or use the 1B variant
# (openbmb/MiniCPM5-1B-GGUF / MiniCPM5-1B-Q4_K_M.gguf).
# PLANNER_CTX (default 8192, matching the main model): a multi-step agent run
# accumulates the tool schemas + task + thread + each step's observations, so
# 4096 overflows on real threads ("request (4142 tokens) exceeds context").
#
#   Reads:  LS, PLANNER_HF_REPO, PLANNER_FILE, PLANNER_PORT, PLANNER_NGL,
#           PLANNER_CTX
#   Writes: PLANNER_PATH
# Always returns 0: the planner is optional, so a failed download is logged and
# the planner is skipped rather than started with an empty model path.
start_planner() {
  if [ -z "${PLANNER_HF_REPO:-}" ] || [ -z "${PLANNER_FILE:-}" ]; then
    return 0
  fi
  echo "[start] planner: ${PLANNER_HF_REPO}/${PLANNER_FILE} on :${PLANNER_PORT:-8081}"
  # Repo/file go in via argv, not spliced into the Python source, so quotes in
  # the names cannot break the snippet.
  PLANNER_PATH="$(python3 -c 'import sys; from huggingface_hub import hf_hub_download; print(hf_hub_download(sys.argv[1], sys.argv[2]))' "$PLANNER_HF_REPO" "$PLANNER_FILE")" || PLANNER_PATH=""
  if [ -z "$PLANNER_PATH" ]; then
    echo "[start] ERROR: planner download failed: ${PLANNER_HF_REPO}/${PLANNER_FILE} -> planner disabled" >&2
    return 0
  fi
  "$LS" -m "$PLANNER_PATH" \
    --host 127.0.0.1 --port "${PLANNER_PORT:-8081}" \
    -ngl "${PLANNER_NGL:-999}" -c "${PLANNER_CTX:-8192}" --jinja &
  echo "[start] planner launching (PLANNER_BASE_URL should be http://127.0.0.1:${PLANNER_PORT:-8081}/v1)"
}

start_planner
# Poll llama-server's /health endpoint until it answers, the server process
# (LLAMA_PID) is gone, or 900 polls spaced 2s apart have been used up.
#   Reads:   LLAMA_PID
#   Outputs: progress / result lines on stdout
#   Returns: 0 once /health succeeds, 1 if the server exited or never became
#            healthy within the poll budget
# health_polls counts completed sleeps, so the reported time is the time
# actually spent sleeping (0s when the first probe already succeeds).
wait_for_llama_server() {
  echo "[start] waiting for llama-server health (model download can take minutes)..."
  health_polls=0
  while [ "$health_polls" -lt 900 ]; do
    # kill -0 sends no signal; it only tests that the process still exists.
    if ! kill -0 "$LLAMA_PID" 2>/dev/null; then
      echo "[start] ERROR: llama-server exited early"
      return 1
    fi
    if curl -sf http://127.0.0.1:8080/health >/dev/null 2>&1; then
      echo "[start] llama-server ready after ~$((health_polls * 2))s"
      return 0
    fi
    sleep 2
    health_polls=$((health_polls + 1))
  done
  # Previously the loop just fell through here without saying anything.
  echo "[start] WARNING: llama-server not healthy after ~$((health_polls * 2))s; giving up waiting"
  return 1
}

# Best-effort: startup continues whether or not the server became healthy.
wait_for_llama_server || true
# Hand the process over to the app (exec replaces this shell).
# INFERENCE_BASE_URL is only echoed for the log here; the ${...:-} fallback keeps
# `set -u` from aborting the script right before the app starts when the
# variable is not set in the environment.
echo "[start] launching app (UI + /agent) -> INFERENCE_BASE_URL=${INFERENCE_BASE_URL:-<unset>}"
exec python3 app.py