AndrewRqy
Clean-Space pass: drop self-test, legacy unused handler, BISECT_MINIMAL branch, duplicate banner PNG, stale comments
f9490c0 | # Launch The Wizard's Oracles. | |
| # | |
| # Default backend: Modal-hosted vLLM + LoRA on an L40S in the cloud. | |
| # Optional backend (--local-llama): llama.cpp running on YOUR machine, | |
| # fully offline — see below. | |
| # | |
| # Flags: | |
| # --base-model Request bare base Qwen (no LoRA) from whichever | |
| # backend is running. Equivalent to | |
| # ORACLES_LLM_MODEL=llm. | |
| # --keep-warm (default) Leave the Modal LLM running on exit | |
| # so the deployed HF Space can still reach it. | |
| # Ignored in --local-llama mode. | |
| # --stop-on-exit Inverse of --keep-warm: stop the Modal app when | |
| # this script exits. Useful only if you know no | |
| # other clients (HF Space, teammates) are using it. | |
| # --local-llama Run llama-cpp-python locally on a GGUF copy of | |
| # the model — no Modal calls, no cloud. Requires | |
| # the .gguf file already on disk (see env var | |
| # ORACLES_GGUF_PATH below). | |
| # --save-trace[=DIR] (default ON) Append every LLM request/response to | |
| # a JSONL file under DIR (default: ./traces/). Drop | |
| # the file into an HF dataset repo for the | |
| # Sharing-is-Caring badge. | |
| # --no-trace Disable trace writing for this run (sets | |
| # ORACLES_TRACE_DISABLE=1). | |
| # --full Enable ALL decorative PNGs (parallax banner, | |
| # parchment, phase backdrops, scene landscapes, | |
| # wizard-desk, open-book, demo-card backdrop). | |
| # Recommended for local runs on fast connections. | |
| # --lean (default) Skip the heavy decorative PNGs. Used by | |
| # the HF Space deployment because its egress | |
| # bandwidth (~15 KB/s) makes the full set add | |
| # multi-megabytes to every cold load. | |
| # | |
| # Env vars: | |
| # ORACLES_LLM_MODEL=llm request base model | |
| # ORACLES_LLM_MODEL=oracle-wizard-lora (default) request fine-tune | |
| # ORACLES_VISUAL_MODE=full same as --full | |
| # ORACLES_VISUAL_MODE=lean (default) same as --lean | |
| # KEEP_LLM_WARM=1 keep Modal LLM running | |
| # ORACLES_GGUF_PATH=/path/to/model.gguf used in --local-llama mode | |
| # ORACLES_LOCAL_LLAMA_PORT=8080 (default) llama-server port | |
| # | |
| # Default-backend prereqs (.env.local or shell): | |
| # MODAL_URL, MODAL_KEY, MODAL_SECRET — set by `modal setup` + proxy tokens. | |
| # | |
| # Local-backend prereqs: | |
| # .venv/bin/pip install 'llama-cpp-python[server]' | |
| # modal volume get oracles-lora-ckpts /gguf ./gguf-out (one-time download) | |
| # ORACLES_GGUF_PATH=./gguf-out/oracles-wizard-14b-q4_k_m.gguf | |
# Abort on the first unhandled command failure.
set -e
# Run from the script's own directory so the relative paths used below
# (app.py, .env.local, ../.venv) resolve no matter where the caller is.
cd "$(dirname "$0")"

# -----------------------------------------------------------------------------
# Parse our own flags (everything else gets passed through to app.py)
# -----------------------------------------------------------------------------
# Default to keep-warm so the deployed HF Space (which shares the same
# Modal endpoint) doesn't get an APIConnectionError every time a local
# run exits. Pass --stop-on-exit to opt out and stop billing.
KEEP_WARM="${KEEP_LLM_WARM:-1}"
USE_BASE_MODEL="0"
USE_LOCAL_LLAMA="0"
SAVE_TRACE_DIR=""
APP_ARGS=()

#######################################
# Split the command line into launcher flags and app.py pass-through args.
# Globals:
#   USE_BASE_MODEL, KEEP_WARM, USE_LOCAL_LLAMA, SAVE_TRACE_DIR (written)
#   APP_ARGS (appended with every unrecognised argument, order preserved)
#   ORACLES_TRACE_DISABLE, ORACLES_VISUAL_MODE (exported when flagged)
# Arguments:
#   The script's positional parameters.
#######################################
parse_run_flags() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --base-model)   USE_BASE_MODEL="1" ;;
      # Already the default; still meaningful because it overrides a
      # KEEP_LLM_WARM=0 inherited from the environment.
      --keep-warm)    KEEP_WARM="1" ;;
      --stop-on-exit) KEEP_WARM="0" ;;
      --local-llama)  USE_LOCAL_LLAMA="1" ;;
      # Only the bare and the =DIR forms are recognised; a separate DIR
      # word is NOT consumed here and falls through to app.py.
      --save-trace)   SAVE_TRACE_DIR="./traces" ;;
      --save-trace=*) SAVE_TRACE_DIR="${arg#--save-trace=}" ;;
      --no-trace)     export ORACLES_TRACE_DISABLE="1" ;;
      --full)         export ORACLES_VISUAL_MODE="full" ;;
      --lean)         export ORACLES_VISUAL_MODE="lean" ;;
      *)              APP_ARGS+=("$arg") ;;
    esac
  done
}
parse_run_flags "$@"
# Trace directory: only when --save-trace / --save-trace=DIR was given.
# Create the directory up front and hand its path to the app through
# ORACLES_TRACE_DIR.
if [ -n "$SAVE_TRACE_DIR" ]; then
  mkdir -p "$SAVE_TRACE_DIR"
  export ORACLES_TRACE_DIR="$SAVE_TRACE_DIR"
  echo "[run.sh] --save-trace: appending LLM exchanges to $SAVE_TRACE_DIR/oracles-trace-<session>.jsonl"
fi

# Visual-mode banner. The app defaults to lean; --full overrides for local
# bandwidth-rich runs that want the parallax banner, parchment texture,
# phase backdrops, scene landscapes, etc.
# This only reports the mode; the value itself comes from the flags or from
# an ORACLES_VISUAL_MODE already present in the environment.
if [ "${ORACLES_VISUAL_MODE:-lean}" = "full" ]; then
  echo "[run.sh] --full: ORACLES_VISUAL_MODE=full (all PNGs / textures / backdrops enabled)"
else
  echo "[run.sh] lean mode (default) — pass --full to enable all visuals"
fi
# -----------------------------------------------------------------------------
# Load .env.local — look in both the project root and oracles_app/ so
# shared credentials are picked up.
# -----------------------------------------------------------------------------
# The files are sourced in order, so a value in ./.env.local wins over the
# same name in ../.env.local. `set -a` auto-exports everything they assign.
# The explicit "./" matters: `.` with a slash-less name searches $PATH before
# the current directory, so a stray .env.local on PATH could be sourced.
# NOTE(review): this runs after flag parsing, so a file that assigns
# ORACLES_VISUAL_MODE / ORACLES_TRACE_DISABLE overrides --full / --lean /
# --no-trace, and KEEP_LLM_WARM set only in a file is read too late to have
# any effect. Confirm whether that ordering is intended.
for env_file in "../.env.local" "./.env.local"; do
  if [ -f "$env_file" ]; then
    set -a; . "$env_file"; set +a
  fi
done

# Skip Gradio's import-time analytics + HuggingFace probe — both can hang on
# old SSL stacks. Also keeps boot fast. Values already set by the caller or
# the env files are respected.
export GRADIO_ANALYTICS_ENABLED=${GRADIO_ANALYTICS_ENABLED:-0}
export HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-1}

# Pick a Python: repo-root venv, then a venv next to this script, then
# whatever python3 is on PATH.
if [ -x "../.venv/bin/python" ]; then
  PY="../.venv/bin/python"
elif [ -x ".venv/bin/python" ]; then
  PY=".venv/bin/python"
else
  PY="python3"
fi

# Fail fast, with an install hint, when the app's imports are unavailable.
"$PY" -c "import gradio, openai" 2>/dev/null || {
  echo "Missing dependencies. Install with: $PY -m pip install -r requirements.txt" >&2
  exit 1
}
# -----------------------------------------------------------------------------
# Decide which served model the app will request.
# -----------------------------------------------------------------------------
# --base-model forces ORACLES_LLM_MODEL=llm. Otherwise an ORACLES_LLM_MODEL
# already present in the environment is kept, falling back to the fine-tune
# name "oracle-wizard-lora".
if [ "$USE_BASE_MODEL" = "1" ]; then
  export ORACLES_LLM_MODEL="llm"
  echo "[run.sh] --base-model: requesting bare base Qwen2.5-14B (no LoRA)."
else
  export ORACLES_LLM_MODEL="${ORACLES_LLM_MODEL:-oracle-wizard-lora}"
  echo "[run.sh] Requesting fine-tune: $ORACLES_LLM_MODEL"
fi
# =============================================================================
# BACKEND BRANCH 1 — Local llama.cpp (--local-llama)
# Runs llama-cpp-python's OpenAI-compatible server in the background. The
# Gradio app sees it as just another endpoint via MODAL_URL=http://localhost.
# =============================================================================

#######################################
# EXIT/INT/TERM handler for the local backend: stop the background server if
# it is still alive, remove the PID file, and exit with the status that was
# current when the handler fired.
# Globals:
#   LL_PID (read), LL_PID_FILE (read)
#######################################
cleanup_local() {
  local rc=$?
  # Disarm first: when we arrive via INT/TERM, the `exit` below would
  # otherwise fire the EXIT trap and run this handler a second time.
  trap - EXIT INT TERM
  if kill -0 "$LL_PID" 2>/dev/null; then
    echo ""
    echo "[run.sh] Stopping local llama-cpp server (PID=$LL_PID)..."
    # Best effort: the server may exit between the liveness probe and here.
    kill "$LL_PID" 2>/dev/null || true
    wait "$LL_PID" 2>/dev/null || true
  fi
  rm -f "$LL_PID_FILE"
  exit "$rc"
}

if [ "$USE_LOCAL_LLAMA" = "1" ]; then
  GGUF_PATH="${ORACLES_GGUF_PATH:-./gguf-out/oracles-wizard-14b-q4_k_m.gguf}"
  LL_PORT="${ORACLES_LOCAL_LLAMA_PORT:-8080}"
  # Single definition of the server log path used by every message below.
  LL_LOG="/tmp/oracles_local_llama.log"

  if [ ! -f "$GGUF_PATH" ]; then
    cat <<EOF >&2
ERROR: GGUF file not found at $GGUF_PATH
To use --local-llama you first need to download the quantized GGUF:
    modal volume get oracles-lora-ckpts /gguf ./gguf-out
Then either:
    ./run.sh --local-llama
    (which expects ./gguf-out/oracles-wizard-14b-q4_k_m.gguf by default)
Or set the path explicitly:
    ORACLES_GGUF_PATH=/abs/path/to/model.gguf ./run.sh --local-llama
EOF
    exit 1
  fi

  if ! "$PY" -c "import llama_cpp.server" 2>/dev/null; then
    echo "ERROR: llama-cpp-python's server module isn't installed." >&2
    echo " Install with: $PY -m pip install 'llama-cpp-python[server]'" >&2
    exit 1
  fi

  echo "[run.sh] --local-llama: starting llama_cpp.server in the background"
  echo " model = $GGUF_PATH"
  echo " port = $LL_PORT"

  # Start llama-cpp's OpenAI-compatible server. On Apple Silicon we want
  # n_gpu_layers=-1 so the Metal backend takes the whole model.
  # The alias makes the server answer to the fine-tune's model name.
  LL_PID_FILE=$(mktemp -t oracles_local_llama_pid.XXXXXX)
  "$PY" -m llama_cpp.server \
    --model "$GGUF_PATH" \
    --host 127.0.0.1 \
    --port "$LL_PORT" \
    --n_gpu_layers -1 \
    --n_ctx 8192 \
    --model_alias oracle-wizard-lora \
    > "$LL_LOG" 2>&1 &
  LL_PID=$!
  echo "$LL_PID" > "$LL_PID_FILE"
  echo "[run.sh] llama_cpp.server PID=$LL_PID (log: $LL_LOG)"

  # Cleanup trap: kill local server on exit
  trap cleanup_local EXIT INT TERM

  # Wait for the server to load the model. 14B Q4 on M-series ~15-40s.
  echo "[run.sh] Waiting for local server to load the model..."
  HEALTHY=0
  for i in $(seq 1 120); do # 120 polls, 2s sleep between them
    # Ready once /v1/models answers with at least one "id" field.
    if curl -s --max-time 2 "http://127.0.0.1:$LL_PORT/v1/models" \
      | grep -q '"id"'; then
      echo "[run.sh] Local server ready (took $((i * 2))s)."
      HEALTHY=1
      break
    fi
    # Stop polling as soon as the server process itself is gone.
    if ! kill -0 "$LL_PID" 2>/dev/null; then
      echo "ERROR: llama_cpp.server died early. Check $LL_LOG" >&2
      exit 1
    fi
    if [ $((i % 5)) -eq 0 ]; then
      echo " ... still loading ($((i * 2))s)"
    fi
    sleep 2
  done
  if [ "$HEALTHY" = "0" ]; then
    echo "ERROR: local server never became ready. See $LL_LOG" >&2
    exit 1
  fi

  # Point the app at the local endpoint. The existing LLMClient already
  # speaks OpenAI's protocol so zero client code changes are needed.
  export MODAL_URL="http://127.0.0.1:$LL_PORT"
  export MODAL_KEY="local" # any non-empty value — local server ignores
  export MODAL_SECRET="local"
  export ORACLES_FORCE_MOCK=0
  echo "[run.sh] App pointed at $MODAL_URL — no Modal calls will be made."
  echo ""

  # Run the app in the foreground; the EXIT trap stops the server afterwards.
  # The explicit exit keeps the Modal branch below from ever running.
  "$PY" app.py "${APP_ARGS[@]}"
  exit 0
fi
# =============================================================================
# BACKEND BRANCH 2 — Modal vLLM (default)
# =============================================================================
LLM_APP_NAME="forest-focus-llm"
REPO_ROOT="$(cd .. && pwd)"
LLM_SCRIPT="$REPO_ROOT/modal_backend/modal_llm.py"
# Deploy output is captured here so the served URL can be scraped from it.
DEPLOY_LOG="/tmp/oracles_modal_deploy.log"

# Both proxy-auth credentials are mandatory for this backend.
if [ -z "${MODAL_KEY:-}" ] || [ -z "${MODAL_SECRET:-}" ]; then
  cat <<'EOF' >&2
ERROR: MODAL_KEY and MODAL_SECRET are required for the default Modal backend.
For the fully-offline local backend (no Modal needed), run with --local-llama
after downloading the GGUF:
    modal volume get oracles-lora-ckpts /gguf ./gguf-out
    ./run.sh --local-llama
To use the default Modal backend, one-time setup:
    1. modal deploy modal_backend/modal_llm.py
    2. https://modal.com/settings/proxy-auth-tokens → Create
    3. Add to oracles_app/.env.local:
         MODAL_KEY=wk-xxxxxxxxx
         MODAL_SECRET=ws-xxxxxxxxx
    4. ./run.sh
To swap the fine-tune for the bare base model: ./run.sh --base-model
EOF
  exit 1
fi

if ! command -v modal >/dev/null 2>&1; then
  echo "ERROR: 'modal' CLI not found on PATH. Install: pip install modal" >&2
  exit 1
fi

if [ ! -f "$LLM_SCRIPT" ]; then
  echo "ERROR: LLM script not found at $LLM_SCRIPT" >&2
  exit 1
fi

# Resolve the workspace name — the Modal proxy URL is determined by it.
# Takes the first line of `modal profile current` with all whitespace removed.
WORKSPACE=$(modal profile current 2>/dev/null | head -1 | tr -d '[:space:]')
if [ -z "$WORKSPACE" ]; then
  echo "ERROR: could not determine Modal workspace. Run 'modal setup' first." >&2
  exit 1
fi
LLM_URL="https://${WORKSPACE}--${LLM_APP_NAME}-serve.modal.run"
echo "[run.sh] Workspace: $WORKSPACE"
echo "[run.sh] LLM URL: $LLM_URL"

# -----------------------------------------------------------------------------
# Deploy if not already deployed.
# -----------------------------------------------------------------------------
echo "[run.sh] Ensuring $LLM_APP_NAME is deployed..."
# A pipeline normally reports the status of its LAST command, i.e. `tee`, so
# a failed `modal deploy` would go unnoticed. pipefail makes the deploy's own
# failure count; the subshell keeps that option from leaking into the rest of
# the script.
if ! (set -o pipefail; modal deploy "$LLM_SCRIPT" 2>&1 | tee "$DEPLOY_LOG"); then
  echo "ERROR: modal deploy failed. See $DEPLOY_LOG" >&2
  exit 1
fi

# Prefer the URL the deploy actually printed over the one derived from the
# workspace name; `|| true` tolerates a log with no *.modal.run URL in it.
PRINTED_URL=$(grep -oE 'https://[A-Za-z0-9.-]+\.modal\.run' "$DEPLOY_LOG" | head -1 || true)
if [ -n "$PRINTED_URL" ]; then
  LLM_URL="$PRINTED_URL"
  echo "[run.sh] Using URL from deploy output: $LLM_URL"
fi
# -----------------------------------------------------------------------------
# Cleanup trap — stop the Modal app when we exit (unless --keep-warm).
# -----------------------------------------------------------------------------
#######################################
# EXIT/INT/TERM handler for the Modal backend. Leaves the Modal app running
# when KEEP_WARM is "1"; otherwise asks the modal CLI to stop it. Always exits
# with the status that was current when the handler fired.
# Globals:
#   KEEP_WARM (read), LLM_APP_NAME (read)
#######################################
cleanup() {
  local rc=$?
  # Disarm first: when we arrive via INT/TERM, the `exit` below would
  # otherwise fire the EXIT trap and run this handler a second time
  # (duplicate messages and a second `modal app stop`).
  trap - EXIT INT TERM
  if [ "$KEEP_WARM" = "1" ]; then
    echo ""
    echo "[run.sh] --keep-warm set; leaving $LLM_APP_NAME running on Modal."
  else
    echo ""
    echo "[run.sh] Stopping $LLM_APP_NAME so the L40S stops billing..."
    # Best effort: a failed stop must not mask the original exit status.
    modal app stop --yes "$LLM_APP_NAME" 2>/dev/null || true
    echo "[run.sh] Stopped."
  fi
  exit "$rc"
}
trap cleanup EXIT INT TERM
# -----------------------------------------------------------------------------
# Wait for the endpoint.
# -----------------------------------------------------------------------------
# Poll /v1/models until it returns a model list or the time budget runs out.
# The budget and the reported times use bash's SECONDS clock, because each
# poll costs up to 5s of curl timeout on top of the 5s sleep — counting
# iterations would under-report and let the wait run far past 10 minutes.
echo "[run.sh] Waiting for endpoint to become healthy (up to 10 minutes)..."
HEALTHY=0
SAW_LORA=0
WAIT_BUDGET_S=600
WAIT_STARTED=$SECONDS
POLLS=0
while [ $((SECONDS - WAIT_STARTED)) -lt "$WAIT_BUDGET_S" ]; do
  POLLS=$((POLLS + 1))
  # curl failures (cold start, connection refused, timeout) are expected
  # while the container boots, hence the `|| true`.
  RESPONSE=$(curl -s \
    -H "Modal-Key: $MODAL_KEY" \
    -H "Modal-Secret: $MODAL_SECRET" \
    --max-time 5 \
    "${LLM_URL}/v1/models" 2>/dev/null || true)
  # Healthy once the body contains at least one "id" field.
  if echo "$RESPONSE" | grep -q '"id"'; then
    echo "[run.sh] LLM ready (took $((SECONDS - WAIT_STARTED))s)."
    HEALTHY=1
    if echo "$RESPONSE" | grep -q "oracle-wizard-lora"; then
      SAW_LORA=1
      echo "[run.sh] ✓ Fine-tune adapter 'oracle-wizard-lora' is served."
    fi
    break
  fi
  # Progress line on every sixth unsuccessful poll.
  if [ $((POLLS % 6)) -eq 0 ]; then
    echo " ... still waiting ($((SECONDS - WAIT_STARTED))s)"
  fi
  sleep 5
done

# Neither condition is fatal: the app is still launched so the user can see
# what the client reports.
if [ "$HEALTHY" = "0" ]; then
  echo "WARN: endpoint never returned a model list within 10 minutes." >&2
elif [ "$USE_BASE_MODEL" = "0" ] && [ "$SAW_LORA" = "0" ]; then
  echo "WARN: endpoint is up but did NOT advertise 'oracle-wizard-lora'." >&2
fi

# -----------------------------------------------------------------------------
# Hand off to the app.
# -----------------------------------------------------------------------------
export MODAL_URL="$LLM_URL"
export ORACLES_FORCE_MOCK=0
echo ""
echo "============================================================"
echo " Open in your browser: http://localhost:7860"
echo " (NOT http://0.0.0.0:7860 — Chrome blocks that by default)"
echo "============================================================"
echo ""
# Foreground run; unrecognised command-line arguments are forwarded verbatim.
"$PY" app.py "${APP_ARGS[@]}"