the-apprentice / run.sh
AndrewRqy
Clean-Space pass: drop self-test, legacy unused handler, BISECT_MINIMAL branch, duplicate banner PNG, stale comments
f9490c0
Raw
History Blame Contribute Delete
14.7 kB
#!/usr/bin/env bash
# Launch The Wizard's Oracles.
#
# Default backend: Modal-hosted vLLM + LoRA on an L40S in the cloud.
# Optional backend (--local-llama): llama.cpp running on YOUR machine,
# fully offline — see below.
#
# Flags:
# --base-model Request bare base Qwen (no LoRA) from whichever
# backend is running. Equivalent to
# ORACLES_LLM_MODEL=llm.
# --keep-warm (default) Leave the Modal LLM running on exit
# so the deployed HF Space can still reach it.
# Ignored in --local-llama mode.
# --stop-on-exit Inverse of --keep-warm: stop the Modal app when
# this script exits. Useful only if you know no
# other clients (HF Space, teammates) are using it.
# --local-llama Run llama-cpp-python locally on a GGUF copy of
# the model — no Modal calls, no cloud. Requires
# the .gguf file already on disk (see env var
# ORACLES_GGUF_PATH below).
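# --save-trace[=DIR] (default ON) Append every LLM request/response to
# a JSONL file under DIR (default: ./traces/). DIR must be attached
# with '=' (--save-trace=DIR); a separate word after --save-trace is
# passed through to app.py instead. Drop the file into an HF dataset
# repo for the Sharing-is-Caring badge.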
# --no-trace Disable trace writing for this run (sets
# ORACLES_TRACE_DISABLE=1).
# --full Enable ALL decorative PNGs (parallax banner,
# parchment, phase backdrops, scene landscapes,
# wizard-desk, open-book, demo-card backdrop).
# Recommended for local runs on fast connections.
# --lean (default) Skip the heavy decorative PNGs. Used by
# the HF Space deployment because its egress
# bandwidth (~15 KB/s) makes the full set add
# multi-megabytes to every cold load.
#
# Env vars:
# ORACLES_LLM_MODEL=llm request base model
# ORACLES_LLM_MODEL=oracle-wizard-lora (default) request fine-tune
# ORACLES_VISUAL_MODE=full same as --full
# ORACLES_VISUAL_MODE=lean (default) same as --lean
# KEEP_LLM_WARM=1 keep Modal LLM running
# ORACLES_GGUF_PATH=/path/to/model.gguf used in --local-llama mode
# ORACLES_LOCAL_LLAMA_PORT=8080 (default) llama-server port
#
# Default-backend prereqs (.env.local or shell):
# MODAL_URL, MODAL_KEY, MODAL_SECRET — set by `modal setup` + proxy tokens.
#
# Local-backend prereqs:
# .venv/bin/pip install 'llama-cpp-python[server]'
# modal volume get oracles-lora-ckpts /gguf ./gguf-out (one-time download)
# ORACLES_GGUF_PATH=./gguf-out/oracles-wizard-14b-q4_k_m.gguf
set -e
cd "$(dirname "$0")"

# -----------------------------------------------------------------------------
# Command-line handling.
# Flags this launcher understands are consumed here; every other argument is
# collected, in order, in APP_ARGS and later forwarded to app.py.
# -----------------------------------------------------------------------------
APP_ARGS=()
SAVE_TRACE_DIR=""
USE_LOCAL_LLAMA="0"
USE_BASE_MODEL="0"
# Keep-warm is the default (seeded from KEEP_LLM_WARM) so the deployed HF
# Space, which shares the same Modal endpoint, doesn't get an
# APIConnectionError every time a local run exits. --stop-on-exit opts out
# and stops billing.
KEEP_WARM="${KEEP_LLM_WARM:-1}"

for cli_arg; do
  case "$cli_arg" in
    --base-model)
      USE_BASE_MODEL="1"
      ;;
    --keep-warm)
      # Already the default; kept for back-compat. Still wins over
      # KEEP_LLM_WARM=0 coming from the environment.
      KEEP_WARM="1"
      ;;
    --stop-on-exit)
      KEEP_WARM="0"
      ;;
    --local-llama)
      USE_LOCAL_LLAMA="1"
      ;;
    --save-trace)
      SAVE_TRACE_DIR="./traces"
      ;;
    --save-trace=*)
      # Everything after the first '=' is the directory.
      SAVE_TRACE_DIR="${cli_arg#--save-trace=}"
      ;;
    --no-trace)
      export ORACLES_TRACE_DISABLE="1"
      ;;
    --full)
      export ORACLES_VISUAL_MODE="full"
      ;;
    --lean)
      export ORACLES_VISUAL_MODE="lean"
      ;;
    *)
      APP_ARGS+=("$cli_arg")
      ;;
  esac
done

# Only when a trace directory was requested: create it and tell the app
# about it through ORACLES_TRACE_DIR.
if [[ -n "$SAVE_TRACE_DIR" ]]; then
  mkdir -p "$SAVE_TRACE_DIR"
  export ORACLES_TRACE_DIR="$SAVE_TRACE_DIR"
  echo "[run.sh] --save-trace: appending LLM exchanges to $SAVE_TRACE_DIR/oracles-trace-<session>.jsonl"
fi
# Announce the visual mode. Anything other than "full" (including an unset
# ORACLES_VISUAL_MODE) is reported as lean; --full is meant for local,
# bandwidth-rich runs that want the parallax banner, parchment texture,
# phase backdrops, scene landscapes, etc.
case "${ORACLES_VISUAL_MODE:-lean}" in
  full)
    echo "[run.sh] --full: ORACLES_VISUAL_MODE=full (all PNGs / textures / backdrops enabled)"
    ;;
  *)
    echo "[run.sh] lean mode (default) — pass --full to enable all visuals"
    ;;
esac
# -----------------------------------------------------------------------------
# Source .env.local from the parent directory first, then from this
# directory, so shared credentials are picked up. `set -a` is on while
# sourcing, so every variable the files assign is exported.
# NOTE(review): this runs after flag parsing, so a value assigned in
# .env.local (e.g. ORACLES_VISUAL_MODE) overrides the export made by the
# matching flag, and KEEP_LLM_WARM set only in .env.local is not seen by
# KEEP_WARM — confirm that precedence is intended.
# -----------------------------------------------------------------------------
for dotenv in "../.env.local" ".env.local"; do
  [ -f "$dotenv" ] || continue
  set -a
  . "$dotenv"
  set +a
done
# Default both switches (without overriding values already set) so Gradio's
# import-time analytics and the HuggingFace probe are skipped — both can hang
# on old SSL stacks. Also keeps boot fast.
: "${GRADIO_ANALYTICS_ENABLED:=0}" "${HF_HUB_OFFLINE:=1}"
export GRADIO_ANALYTICS_ENABLED HF_HUB_OFFLINE
# Choose the interpreter: the parent directory's virtualenv wins, then a
# virtualenv in this directory, otherwise whatever `python3` is on PATH.
PY="python3"
for venv_python in "../.venv/bin/python" ".venv/bin/python"; do
  if [ -x "$venv_python" ]; then
    PY="$venv_python"
    break
  fi
done

# Bail out early when the chosen interpreter cannot import the app's
# dependencies. The import error itself is discarded; only the hint is shown.
if ! "$PY" -c "import gradio, openai" 2>/dev/null; then
  echo "Missing dependencies. Install with: $PY -m pip install -r requirements.txt"
  exit 1
fi
# -----------------------------------------------------------------------------
# Export ORACLES_LLM_MODEL, the served-model name the app will request.
# --base-model forces "llm"; otherwise an existing ORACLES_LLM_MODEL is kept
# and "oracle-wizard-lora" is the fallback.
# -----------------------------------------------------------------------------
case "$USE_BASE_MODEL" in
  1)
    export ORACLES_LLM_MODEL="llm"
    echo "[run.sh] --base-model: requesting bare base Qwen2.5-14B (no LoRA)."
    ;;
  *)
    export ORACLES_LLM_MODEL="${ORACLES_LLM_MODEL:-oracle-wizard-lora}"
    echo "[run.sh] Requesting fine-tune: $ORACLES_LLM_MODEL"
    ;;
esac
# =============================================================================
# BACKEND BRANCH 1 — Local llama.cpp (--local-llama)
# Runs llama-cpp-python's OpenAI-compatible server in the background, waits
# for it to answer /v1/models, then starts app.py with MODAL_URL pointing at
# http://127.0.0.1:<port>. The branch always ends the script, so the Modal
# branch further down is never reached in this mode.
# =============================================================================
if [ "$USE_LOCAL_LLAMA" = "1" ]; then
  GGUF_PATH="${ORACLES_GGUF_PATH:-./gguf-out/oracles-wizard-14b-q4_k_m.gguf}"
  LL_PORT="${ORACLES_LOCAL_LLAMA_PORT:-8080}"

  # Preflight 1: the quantized model must already be on disk.
  if [ ! -f "$GGUF_PATH" ]; then
    cat <<EOF >&2
ERROR: GGUF file not found at $GGUF_PATH
To use --local-llama you first need to download the quantized GGUF:
modal volume get oracles-lora-ckpts /gguf ./gguf-out
Then either:
./run.sh --local-llama
(which expects ./gguf-out/oracles-wizard-14b-q4_k_m.gguf by default)
Or set the path explicitly:
ORACLES_GGUF_PATH=/abs/path/to/model.gguf ./run.sh --local-llama
EOF
    exit 1
  fi

  # Preflight 2: the server extra of llama-cpp-python must be importable.
  if ! "$PY" -c "import llama_cpp.server" 2>/dev/null; then
    echo "ERROR: llama-cpp-python's server module isn't installed." >&2
    echo " Install with: $PY -m pip install 'llama-cpp-python[server]'" >&2
    exit 1
  fi

  echo "[run.sh] --local-llama: starting llama_cpp.server in the background"
  echo " model = $GGUF_PATH"
  echo " port = $LL_PORT"

  # Start llama-cpp's OpenAI-compatible server. On Apple Silicon we want
  # n_gpu_layers=-1 so the Metal backend takes the whole model. The server is
  # always registered under the alias "oracle-wizard-lora".
  LL_PID_FILE=$(mktemp -t oracles_local_llama_pid.XXXXXX)
  "$PY" -m llama_cpp.server \
    --model "$GGUF_PATH" \
    --host 127.0.0.1 \
    --port "$LL_PORT" \
    --n_gpu_layers -1 \
    --n_ctx 8192 \
    --model_alias oracle-wizard-lora \
    > /tmp/oracles_local_llama.log 2>&1 &
  LL_PID=$!
  echo "$LL_PID" > "$LL_PID_FILE"
  echo "[run.sh] llama_cpp.server PID=$LL_PID (log: /tmp/oracles_local_llama.log)"

  # Exit/signal handler: stop the background server (if still alive), remove
  # the PID file, and exit with the status that was current on entry.
  cleanup_local() {
    local rc=$?
    # Disarm first: the `exit` below would otherwise fire the EXIT trap and
    # run this handler a second time after an INT/TERM.
    trap - EXIT INT TERM
    if kill -0 "$LL_PID" 2>/dev/null; then
      echo ""
      echo "[run.sh] Stopping local llama-cpp server (PID=$LL_PID)..."
      kill "$LL_PID" 2>/dev/null || true
      wait "$LL_PID" 2>/dev/null || true
    fi
    rm -f "$LL_PID_FILE"
    exit "$rc"
  }
  trap cleanup_local EXIT INT TERM

  # Wait for the server to load the model. 14B Q4 on M-series ~15-40s.
  # At most 120 probes; each costs a 2s sleep plus up to 2s of curl timeout,
  # so elapsed time is measured with $SECONDS rather than derived from the
  # probe count.
  echo "[run.sh] Waiting for local server to load the model..."
  HEALTHY=0
  LL_WAIT_START=$SECONDS
  for i in {1..120}; do
    if curl -s --max-time 2 "http://127.0.0.1:$LL_PORT/v1/models" \
      | grep -q '"id"'; then
      echo "[run.sh] Local server ready (took $((SECONDS - LL_WAIT_START))s)."
      HEALTHY=1
      break
    fi
    # No point in polling further once the server process is gone.
    if ! kill -0 "$LL_PID" 2>/dev/null; then
      echo "ERROR: llama_cpp.server died early. Check /tmp/oracles_local_llama.log" >&2
      exit 1
    fi
    if [ $((i % 5)) -eq 0 ]; then
      echo " ... still loading ($((SECONDS - LL_WAIT_START))s)"
    fi
    sleep 2
  done
  if [ "$HEALTHY" = "0" ]; then
    echo "ERROR: local server never became ready. See /tmp/oracles_local_llama.log" >&2
    exit 1
  fi

  # Point the app at the local endpoint. The existing LLMClient already
  # speaks OpenAI's protocol so zero client code changes are needed.
  export MODAL_URL="http://127.0.0.1:$LL_PORT"
  export MODAL_KEY="local" # any non-empty value — local server ignores
  export MODAL_SECRET="local"
  export ORACLES_FORCE_MOCK=0
  echo "[run.sh] App pointed at $MODAL_URL — no Modal calls will be made."
  echo ""
  # Under `set -e` a failing app.py ends the script with its status (via the
  # EXIT trap); the explicit exit below is only reached on success.
  "$PY" app.py "${APP_ARGS[@]}"
  exit 0
fi
# =============================================================================
# BACKEND BRANCH 2 — Modal vLLM (default)
# Fixed names for this branch: the Modal app name, the absolute path of the
# parent directory, and the deploy script expected underneath it.
# =============================================================================
LLM_APP_NAME="forest-focus-llm"
REPO_ROOT="$(cd .. && pwd)"
LLM_SCRIPT="${REPO_ROOT}/modal_backend/modal_llm.py"
# Both proxy-auth values must be non-empty before anything talks to Modal;
# otherwise print the setup instructions on stderr and stop.
if [[ -z "${MODAL_KEY:-}" || -z "${MODAL_SECRET:-}" ]]; then
  cat <<'EOF' >&2
ERROR: MODAL_KEY and MODAL_SECRET are required for the default Modal backend.
For the fully-offline local backend (no Modal needed), run with --local-llama
after downloading the GGUF:
modal volume get oracles-lora-ckpts /gguf ./gguf-out
./run.sh --local-llama
To use the default Modal backend, one-time setup:
1. modal deploy modal_backend/modal_llm.py
2. https://modal.com/settings/proxy-auth-tokens → Create
3. Add to oracles_app/.env.local:
MODAL_KEY=wk-xxxxxxxxx
MODAL_SECRET=ws-xxxxxxxxx
4. ./run.sh
To swap the fine-tune for the bare base model: ./run.sh --base-model
EOF
  exit 1
fi
# The `modal` CLI must be on PATH ...
command -v modal >/dev/null 2>&1 || {
  echo "ERROR: 'modal' CLI not found on PATH. Install: pip install modal" >&2
  exit 1
}
# ... and the deploy script must exist where LLM_SCRIPT points.
[ -f "$LLM_SCRIPT" ] || {
  echo "ERROR: LLM script not found at $LLM_SCRIPT" >&2
  exit 1
}
# The Modal proxy URL is derived from the workspace name, taken here as the
# first line of `modal profile current` with all whitespace removed.
WORKSPACE="$(modal profile current 2>/dev/null | head -n 1 | tr -d '[:space:]')"
[ -n "$WORKSPACE" ] || {
  echo "ERROR: could not determine Modal workspace. Run 'modal setup' first."
  exit 1
}
LLM_URL="https://${WORKSPACE}--${LLM_APP_NAME}-serve.modal.run"
echo "[run.sh] Workspace: $WORKSPACE"
echo "[run.sh] LLM URL: $LLM_URL"
# -----------------------------------------------------------------------------
# (Re)deploy the LLM app. `modal deploy` runs on every launch; its output is
# mirrored to the terminal and to /tmp/oracles_modal_deploy.log, and a
# *.modal.run URL printed there replaces the URL guessed from the workspace.
# -----------------------------------------------------------------------------
echo "[run.sh] Ensuring $LLM_APP_NAME is deployed..."
# pipefail is enabled inside a subshell only: without it the pipeline's status
# is tee's (practically always 0) and a failed deploy would go unnoticed. The
# subshell keeps the option from leaking into the rest of the script.
if ! (
  set -o pipefail
  modal deploy "$LLM_SCRIPT" 2>&1 | tee /tmp/oracles_modal_deploy.log
); then
  echo "ERROR: modal deploy failed. See /tmp/oracles_modal_deploy.log" >&2
  exit 1
fi
# First matching URL in the deploy output, if any; `|| true` keeps a
# no-match from aborting the script.
PRINTED_URL=$(grep -oE 'https://[A-Za-z0-9.-]+\.modal\.run' /tmp/oracles_modal_deploy.log | head -1 || true)
if [ -n "$PRINTED_URL" ]; then
  LLM_URL="$PRINTED_URL"
  echo "[run.sh] Using URL from deploy output: $LLM_URL"
fi
# -----------------------------------------------------------------------------
# Cleanup trap — on exit, stop the Modal app only when --stop-on-exit (or
# KEEP_LLM_WARM=0) turned keep-warm off; by default the app is left running.
# -----------------------------------------------------------------------------
# Runs on EXIT, INT and TERM. Reads KEEP_WARM and LLM_APP_NAME, and ends the
# script with the status that was current when the handler was entered.
cleanup() {
  local rc=$?
  # Disarm first: the `exit` at the bottom would otherwise fire the EXIT trap
  # after an INT/TERM and run this handler (and `modal app stop`) twice.
  trap - EXIT INT TERM
  if [ "$KEEP_WARM" = "1" ]; then
    echo ""
    echo "[run.sh] --keep-warm set; leaving $LLM_APP_NAME running on Modal."
  else
    echo ""
    echo "[run.sh] Stopping $LLM_APP_NAME so the L40S stops billing..."
    # Best effort: a failing stop must not mask the script's own exit status.
    modal app stop --yes "$LLM_APP_NAME" 2>/dev/null || true
    echo "[run.sh] Stopped."
  fi
  exit "$rc"
}
trap cleanup EXIT INT TERM
# -----------------------------------------------------------------------------
# Wait for the endpoint.
# Polls ${LLM_URL}/v1/models with the Modal proxy-auth headers until the reply
# contains an "id" field or the 10-minute budget is spent. Running out of time
# is only a warning: the app is launched afterwards either way.
# -----------------------------------------------------------------------------
echo "[run.sh] Waiting for endpoint to become healthy (up to 10 minutes)..."
HEALTHY=0
SAW_LORA=0
# The budget is wall-clock based: each probe can block for curl's 5s timeout
# on top of the 5s sleep, so counting iterations would let the wait run to
# roughly twice the advertised limit and under-report the elapsed time.
WAIT_BUDGET_S=600
WAIT_START=$SECONDS
PROBE_COUNT=0
while [ $((SECONDS - WAIT_START)) -lt "$WAIT_BUDGET_S" ]; do
  PROBE_COUNT=$((PROBE_COUNT + 1))
  # A failed or timed-out request yields an empty RESPONSE, not an abort.
  RESPONSE=$(curl -s \
    -H "Modal-Key: $MODAL_KEY" \
    -H "Modal-Secret: $MODAL_SECRET" \
    --max-time 5 \
    "${LLM_URL}/v1/models" 2>/dev/null || true)
  if grep -q '"id"' <<<"$RESPONSE"; then
    echo "[run.sh] LLM ready (took $((SECONDS - WAIT_START))s)."
    HEALTHY=1
    if grep -q "oracle-wizard-lora" <<<"$RESPONSE"; then
      SAW_LORA=1
      echo "[run.sh] ✓ Fine-tune adapter 'oracle-wizard-lora' is served."
    fi
    break
  fi
  # Progress line on every sixth unsuccessful probe.
  if [ $((PROBE_COUNT % 6)) -eq 0 ]; then
    echo " ... still waiting ($((SECONDS - WAIT_START))s)"
  fi
  sleep 5
done
if [ "$HEALTHY" = "0" ]; then
  echo "WARN: endpoint never returned a model list within 10 minutes." >&2
elif [ "$USE_BASE_MODEL" = "0" ] && [ "$SAW_LORA" = "0" ]; then
  echo "WARN: endpoint is up but did NOT advertise 'oracle-wizard-lora'." >&2
fi
# -----------------------------------------------------------------------------
# Hand off to the app: export the endpoint URL, switch mock mode off, print
# the browser hint, then run app.py with the pass-through arguments. The app
# runs as a child (not exec'd), so the EXIT trap still fires when it returns.
# -----------------------------------------------------------------------------
export MODAL_URL="$LLM_URL" ORACLES_FORCE_MOCK=0
cat <<'EOF'

============================================================
 Open in your browser: http://localhost:7860
 (NOT http://0.0.0.0:7860 — Chrome blocks that by default)
============================================================

EOF
"$PY" app.py "${APP_ARGS[@]}"