#!/usr/bin/env bash
# Launch The Wizard's Oracles.
#
# Default backend:  Modal-hosted vLLM + LoRA on an L40S in the cloud.
# Optional backend (--local-llama): llama.cpp running on YOUR machine,
#                   fully offline — see below.
#
# Flags (anything not listed here is passed through to app.py unchanged):
#   --base-model        Request bare base Qwen (no LoRA) from whichever
#                       backend is running. Equivalent to
#                       ORACLES_LLM_MODEL=llm.
#   --keep-warm         (default) Leave the Modal LLM running on exit
#                       so the deployed HF Space can still reach it.
#                       Ignored in --local-llama mode.
#   --stop-on-exit      Inverse of --keep-warm: stop the Modal app when
#                       this script exits. Useful only if you know no
#                       other clients (HF Space, teammates) are using it.
#   --local-llama       Run llama-cpp-python locally on a GGUF copy of
#                       the model — no Modal calls, no cloud. Requires
#                       the .gguf file already on disk (see env var
#                       ORACLES_GGUF_PATH below).
#   --save-trace[=DIR]  (default ON) Append every LLM request/response to
#                       a JSONL file under DIR (default: ./traces/). Drop
#                       the file into an HF dataset repo for the
#                       Sharing-is-Caring badge. DIR must be attached with
#                       '=' (a separate word would be passed to app.py).
#                       Without this flag the script leaves
#                       ORACLES_TRACE_DIR untouched.
#   --no-trace          Disable trace writing for this run (sets
#                       ORACLES_TRACE_DISABLE=1).
#   --full              Enable ALL decorative PNGs (parallax banner,
#                       parchment, phase backdrops, scene landscapes,
#                       wizard-desk, open-book, demo-card backdrop).
#                       Recommended for local runs on fast connections.
#   --lean              (default) Skip the heavy decorative PNGs. Used by
#                       the HF Space deployment because its egress
#                       bandwidth (~15 KB/s) makes the full set add
#                       multi-megabytes to every cold load.
#
# Env vars:
#   ORACLES_LLM_MODEL=llm                  request base model
#   ORACLES_LLM_MODEL=oracle-wizard-lora   (default) request fine-tune
#   ORACLES_VISUAL_MODE=full               same as --full
#   ORACLES_VISUAL_MODE=lean               (default) same as --lean
#   KEEP_LLM_WARM=1                        keep Modal LLM running
#   ORACLES_GGUF_PATH=/path/to/model.gguf  used in --local-llama mode
#   ORACLES_LOCAL_LLAMA_PORT=8080          (default) llama-server port
#
# Default-backend prereqs (.env.local or shell):
#   MODAL_URL, MODAL_KEY, MODAL_SECRET — set by `modal setup` + proxy tokens.
#
# Local-backend prereqs:
#   .venv/bin/pip install 'llama-cpp-python[server]'
#   modal volume get oracles-lora-ckpts /gguf ./gguf-out   (one-time download)
#   ORACLES_GGUF_PATH=./gguf-out/oracles-wizard-14b-q4_k_m.gguf

set -e
cd "$(dirname "$0")"

# -----------------------------------------------------------------------------
# Load .env.local — look in both the project root and oracles_app/ so
# shared credentials are picked up.
#
# This happens BEFORE flag parsing on purpose: values from the env file act
# as defaults (including KEEP_LLM_WARM), and command-line flags such as
# --full / --lean / --no-trace then override them instead of being silently
# clobbered after the banners have already been printed.
# -----------------------------------------------------------------------------
for env_file in "../.env.local" ".env.local"; do
  if [[ -f "$env_file" ]]; then
    set -a # auto-export everything the env file assigns
    # shellcheck disable=SC1090 — path is only known at runtime
    . "$env_file"
    set +a
  fi
done

# -----------------------------------------------------------------------------
# Parse our own flags (everything else gets passed through to app.py)
# -----------------------------------------------------------------------------
# Default to keep-warm so the deployed HF Space (which shares the same
# Modal endpoint) doesn't get an APIConnectionError every time a local
# run exits. Pass --stop-on-exit to opt out and stop billing.
KEEP_WARM="${KEEP_LLM_WARM:-1}"
USE_BASE_MODEL="0"
USE_LOCAL_LLAMA="0"
SAVE_TRACE_DIR=""
APP_ARGS=()
for arg in "$@"; do
  case "$arg" in
    --base-model) USE_BASE_MODEL="1" ;;
    --keep-warm) KEEP_WARM="1" ;; # back-compat noop
    --stop-on-exit) KEEP_WARM="0" ;;
    --local-llama) USE_LOCAL_LLAMA="1" ;;
    --save-trace) SAVE_TRACE_DIR="./traces" ;;
    --save-trace=*)
      SAVE_TRACE_DIR="${arg#--save-trace=}"
      # An empty "--save-trace=" would otherwise silently disable the flag.
      SAVE_TRACE_DIR="${SAVE_TRACE_DIR:-./traces}"
      ;;
    --no-trace) export ORACLES_TRACE_DISABLE="1" ;;
    --full) export ORACLES_VISUAL_MODE="full" ;;
    --lean) export ORACLES_VISUAL_MODE="lean" ;;
    *) APP_ARGS+=("$arg") ;;
  esac
done

if [[ -n "$SAVE_TRACE_DIR" ]]; then
  mkdir -p -- "$SAVE_TRACE_DIR"
  export ORACLES_TRACE_DIR="$SAVE_TRACE_DIR"
  # NOTE(review): the file name below looks truncated (possibly a lost
  # "<timestamp>"-style placeholder) — confirm against the name app.py writes.
  echo "[run.sh] --save-trace: appending LLM exchanges to $SAVE_TRACE_DIR/oracles-trace-.jsonl"
fi

# Visual-mode banner. The app defaults to lean; --full overrides for local
# bandwidth-rich runs that want the parallax banner, parchment texture,
# phase backdrops, scene landscapes, etc.
if [[ "${ORACLES_VISUAL_MODE:-lean}" == "full" ]]; then
  echo "[run.sh] --full: ORACLES_VISUAL_MODE=full (all PNGs / textures / backdrops enabled)"
else
  echo "[run.sh] lean mode (default) — pass --full to enable all visuals"
fi

# Skip Gradio's import-time analytics + HuggingFace probe — both can hang on
# old SSL stacks. Also keeps boot fast.
export GRADIO_ANALYTICS_ENABLED="${GRADIO_ANALYTICS_ENABLED:-0}"
export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"

# Pick a Python: prefer the repo-level venv, then a local one, then PATH.
if [[ -x "../.venv/bin/python" ]]; then
  PY="../.venv/bin/python"
elif [[ -x ".venv/bin/python" ]]; then
  PY=".venv/bin/python"
else
  PY="python3"
fi

"$PY" -c "import gradio, openai" 2>/dev/null || {
  echo "Missing dependencies. Install with: $PY -m pip install -r requirements.txt" >&2
  exit 1
}

# -----------------------------------------------------------------------------
# Decide which served model the app will request.
# -----------------------------------------------------------------------------
if [[ "$USE_BASE_MODEL" == "1" ]]; then
  export ORACLES_LLM_MODEL="llm"
  echo "[run.sh] --base-model: requesting bare base Qwen2.5-14B (no LoRA)."
else
  export ORACLES_LLM_MODEL="${ORACLES_LLM_MODEL:-oracle-wizard-lora}"
  echo "[run.sh] Requesting fine-tune: $ORACLES_LLM_MODEL"
fi

# =============================================================================
# BACKEND BRANCH 1 — Local llama.cpp (--local-llama)
# Runs llama-cpp-python's OpenAI-compatible server in the background. The
# Gradio app sees it as just another endpoint via MODAL_URL=http://localhost.
# =============================================================================
if [[ "$USE_LOCAL_LLAMA" == "1" ]]; then
  GGUF_PATH="${ORACLES_GGUF_PATH:-./gguf-out/oracles-wizard-14b-q4_k_m.gguf}"
  LL_PORT="${ORACLES_LOCAL_LLAMA_PORT:-8080}"
  LL_LOG="/tmp/oracles_local_llama.log"

  if [[ ! -f "$GGUF_PATH" ]]; then
    # Unquoted delimiter: $GGUF_PATH must expand inside the message.
    cat <<EOF >&2
ERROR: GGUF file not found at $GGUF_PATH

To use --local-llama you first need to download the quantized GGUF:
  modal volume get oracles-lora-ckpts /gguf ./gguf-out

Then either:
  ./run.sh --local-llama
    (which expects ./gguf-out/oracles-wizard-14b-q4_k_m.gguf by default)

Or set the path explicitly:
  ORACLES_GGUF_PATH=/abs/path/to/model.gguf ./run.sh --local-llama
EOF
    exit 1
  fi

  if ! "$PY" -c "import llama_cpp.server" 2>/dev/null; then
    echo "ERROR: llama-cpp-python's server module isn't installed." >&2
    echo " Install with: $PY -m pip install 'llama-cpp-python[server]'" >&2
    exit 1
  fi

  echo "[run.sh] --local-llama: starting llama_cpp.server in the background"
  echo " model = $GGUF_PATH"
  echo " port = $LL_PORT"

  # Start llama-cpp's OpenAI-compatible server. On Apple Silicon we want
  # n_gpu_layers=-1 so the Metal backend takes the whole model.
  LL_PID_FILE=$(mktemp -t oracles_local_llama_pid.XXXXXX)
  "$PY" -m llama_cpp.server \
    --model "$GGUF_PATH" \
    --host 127.0.0.1 \
    --port "$LL_PORT" \
    --n_gpu_layers -1 \
    --n_ctx 8192 \
    --model_alias oracle-wizard-lora \
    >"$LL_LOG" 2>&1 &
  LL_PID=$!
  echo "$LL_PID" >"$LL_PID_FILE"
  echo "[run.sh] llama_cpp.server PID=$LL_PID (log: $LL_LOG)"

  # Cleanup trap: kill the local server on exit and preserve the exit status
  # that was current when the trap fired.
  cleanup_local() {
    local rc=$?
    # Disarm first: the `exit` below would otherwise fire the EXIT trap and
    # run this handler a second time after an INT/TERM.
    trap - EXIT INT TERM
    if kill -0 "$LL_PID" 2>/dev/null; then
      echo ""
      echo "[run.sh] Stopping local llama-cpp server (PID=$LL_PID)..."
      kill "$LL_PID" 2>/dev/null || true
      wait "$LL_PID" 2>/dev/null || true
    fi
    rm -f -- "$LL_PID_FILE"
    exit "$rc"
  }
  trap cleanup_local EXIT INT TERM

  # Wait for the server to load the model. 14B Q4 on M-series ~15-40s.
  echo "[run.sh] Waiting for local server to load the model..."
  HEALTHY=0
  wait_start=$SECONDS
  # 120 polls, 2s apart (~4 min when the port refuses connections at once;
  # each poll may additionally spend up to curl's 2s timeout).
  for ((i = 1; i <= 120; i++)); do
    models_json=$(curl -s --max-time 2 "http://127.0.0.1:$LL_PORT/v1/models" 2>/dev/null || true)
    if [[ "$models_json" == *'"id"'* ]]; then
      echo "[run.sh] Local server ready (took $((SECONDS - wait_start))s)."
      HEALTHY=1
      break
    fi
    if ! kill -0 "$LL_PID" 2>/dev/null; then
      echo "ERROR: llama_cpp.server died early. Check $LL_LOG" >&2
      exit 1
    fi
    if ((i % 5 == 0)); then
      echo " ... still loading ($((SECONDS - wait_start))s)"
    fi
    sleep 2
  done
  if [[ "$HEALTHY" == "0" ]]; then
    echo "ERROR: local server never became ready. See $LL_LOG" >&2
    exit 1
  fi

  # Point the app at the local endpoint. The existing LLMClient already
  # speaks OpenAI's protocol so zero client code changes are needed.
  export MODAL_URL="http://127.0.0.1:$LL_PORT"
  export MODAL_KEY="local" # any non-empty value — local server ignores
  export MODAL_SECRET="local"
  export ORACLES_FORCE_MOCK=0
  echo "[run.sh] App pointed at $MODAL_URL — no Modal calls will be made."
  echo ""
  "$PY" app.py "${APP_ARGS[@]}"
  exit 0
fi

# =============================================================================
# BACKEND BRANCH 2 — Modal vLLM (default)
# =============================================================================
LLM_APP_NAME="forest-focus-llm"
REPO_ROOT="$(cd .. && pwd)"
LLM_SCRIPT="$REPO_ROOT/modal_backend/modal_llm.py"
DEPLOY_LOG="/tmp/oracles_modal_deploy.log"

if [[ -z "${MODAL_KEY:-}" || -z "${MODAL_SECRET:-}" ]]; then
  cat <<'EOF' >&2
ERROR: MODAL_KEY and MODAL_SECRET are required for the default Modal backend.

For the fully-offline local backend (no Modal needed), run with --local-llama
after downloading the GGUF:
  modal volume get oracles-lora-ckpts /gguf ./gguf-out
  ./run.sh --local-llama

To use the default Modal backend, one-time setup:
  1. modal deploy modal_backend/modal_llm.py
  2. https://modal.com/settings/proxy-auth-tokens → Create
  3. Add to oracles_app/.env.local:
       MODAL_KEY=wk-xxxxxxxxx
       MODAL_SECRET=ws-xxxxxxxxx
  4. ./run.sh

To swap the fine-tune for the bare base model:
  ./run.sh --base-model
EOF
  exit 1
fi

if ! command -v modal >/dev/null 2>&1; then
  echo "ERROR: 'modal' CLI not found on PATH. Install: pip install modal" >&2
  exit 1
fi

if [[ ! -f "$LLM_SCRIPT" ]]; then
  echo "ERROR: LLM script not found at $LLM_SCRIPT" >&2
  exit 1
fi

# Resolve the workspace name — the Modal proxy URL is determined by it.
WORKSPACE=$(modal profile current 2>/dev/null | head -1 | tr -d '[:space:]')
if [[ -z "$WORKSPACE" ]]; then
  echo "ERROR: could not determine Modal workspace. Run 'modal setup' first." >&2
  exit 1
fi
LLM_URL="https://${WORKSPACE}--${LLM_APP_NAME}-serve.modal.run"
echo "[run.sh] Workspace: $WORKSPACE"
echo "[run.sh] LLM URL: $LLM_URL"

# -----------------------------------------------------------------------------
# Deploy if not already deployed.
# -----------------------------------------------------------------------------
echo "[run.sh] Ensuring $LLM_APP_NAME is deployed..."
# pipefail is scoped to this subshell so the pipeline reports `modal deploy`'s
# status rather than tee's; without it a failed deploy went unnoticed.
if ! (
  set -o pipefail
  modal deploy "$LLM_SCRIPT" 2>&1 | tee "$DEPLOY_LOG"
); then
  echo "ERROR: modal deploy failed. See $DEPLOY_LOG" >&2
  exit 1
fi

# Prefer the URL the deploy actually printed over the one derived above.
PRINTED_URL=$(grep -oE 'https://[A-Za-z0-9.-]+\.modal\.run' "$DEPLOY_LOG" | head -1 || true)
if [[ -n "$PRINTED_URL" ]]; then
  LLM_URL="$PRINTED_URL"
  echo "[run.sh] Using URL from deploy output: $LLM_URL"
fi

# -----------------------------------------------------------------------------
# Cleanup trap — stop the Modal app when we exit (unless --keep-warm).
# -----------------------------------------------------------------------------
cleanup() {
  local rc=$?
  # Disarm first: the `exit` below would otherwise fire the EXIT trap and
  # run this handler (and `modal app stop`) a second time after an INT/TERM.
  trap - EXIT INT TERM
  echo ""
  if [[ "$KEEP_WARM" == "1" ]]; then
    echo "[run.sh] --keep-warm set; leaving $LLM_APP_NAME running on Modal."
  else
    echo "[run.sh] Stopping $LLM_APP_NAME so the L40S stops billing..."
    # Best effort: the app may already be stopped.
    modal app stop --yes "$LLM_APP_NAME" 2>/dev/null || true
    echo "[run.sh] Stopped."
  fi
  exit "$rc"
}
trap cleanup EXIT INT TERM

# -----------------------------------------------------------------------------
# Wait for the endpoint.
# -----------------------------------------------------------------------------
echo "[run.sh] Waiting for endpoint to become healthy (up to 10 minutes)..."
HEALTHY=0
SAW_LORA=0
wait_start=$SECONDS
# 120 polls, 5s apart; each poll may additionally spend up to curl's 5s
# timeout, so the real upper bound can exceed the nominal 10 minutes.
for ((i = 1; i <= 120; i++)); do
  RESPONSE=$(curl -s \
    -H "Modal-Key: $MODAL_KEY" \
    -H "Modal-Secret: $MODAL_SECRET" \
    --max-time 5 \
    "${LLM_URL}/v1/models" 2>/dev/null || true)
  if [[ "$RESPONSE" == *'"id"'* ]]; then
    echo "[run.sh] LLM ready (took $((SECONDS - wait_start))s)."
    HEALTHY=1
    if [[ "$RESPONSE" == *"oracle-wizard-lora"* ]]; then
      SAW_LORA=1
      echo "[run.sh] ✓ Fine-tune adapter 'oracle-wizard-lora' is served."
    fi
    break
  fi
  if ((i % 6 == 0)); then
    echo " ... still waiting ($((SECONDS - wait_start))s)"
  fi
  sleep 5
done

# Deliberately non-fatal: the app is still launched so the user can retry
# from the UI once the endpoint comes up.
if [[ "$HEALTHY" == "0" ]]; then
  echo "WARN: endpoint never returned a model list within 10 minutes." >&2
elif [[ "$USE_BASE_MODEL" == "0" && "$SAW_LORA" == "0" ]]; then
  echo "WARN: endpoint is up but did NOT advertise 'oracle-wizard-lora'." >&2
fi

# -----------------------------------------------------------------------------
# Hand off to the app.
# -----------------------------------------------------------------------------
export MODAL_URL="$LLM_URL"
export ORACLES_FORCE_MOCK=0

echo ""
echo "============================================================"
echo " Open in your browser: http://localhost:7860"
echo " (NOT http://0.0.0.0:7860 — Chrome blocks that by default)"
echo "============================================================"
echo ""

"$PY" app.py "${APP_ARGS[@]}"