the-apprentice

Sleeping

AndrewRqy

Clean-Space pass: drop self-test, legacy unused handler, BISECT_MINIMAL branch, duplicate banner PNG, stale comments

f9490c0 15 days ago

14.7 kB

	#!/usr/bin/env bash
	# Launch The Wizard's Oracles.
	#
	# Default backend: Modal-hosted vLLM + LoRA on an L40S in the cloud.
	# Optional backend (--local-llama): llama.cpp running on YOUR machine,
	# fully offline — see below.
	#
	# Flags:
	# --base-model Request bare base Qwen (no LoRA) from whichever
	# backend is running. Equivalent to
	# ORACLES_LLM_MODEL=llm.
	# --keep-warm (default) Leave the Modal LLM running on exit
	# so the deployed HF Space can still reach it.
	# Ignored in --local-llama mode.
	# --stop-on-exit Inverse of --keep-warm: stop the Modal app when
	# this script exits. Useful only if you know no
	# other clients (HF Space, teammates) are using it.
	# --local-llama Run llama-cpp-python locally on a GGUF copy of
	# the model — no Modal calls, no cloud. Requires
	# the .gguf file already on disk (see env var
	# ORACLES_GGUF_PATH below).
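	# --save-trace[=DIR] (default ON) Append every LLM request/response to
	# a JSONL file under DIR (default: ./traces/). DIR must be attached
	# with "=" (--save-trace=DIR); a separate word after --save-trace is
	# forwarded to app.py instead. Drop
	# the file into an HF dataset repo for the
	# Sharing-is-Caring badge.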
	# --no-trace Disable trace writing for this run (sets
	# ORACLES_TRACE_DISABLE=1).
	# --full Enable ALL decorative PNGs (parallax banner,
	# parchment, phase backdrops, scene landscapes,
	# wizard-desk, open-book, demo-card backdrop).
	# Recommended for local runs on fast connections.
	# --lean (default) Skip the heavy decorative PNGs. Used by
	# the HF Space deployment because its egress
	# bandwidth (~15 KB/s) makes the full set add
	# multi-megabytes to every cold load.
	#
	# Env vars:
	# ORACLES_LLM_MODEL=llm request base model
	# ORACLES_LLM_MODEL=oracle-wizard-lora (default) request fine-tune
	# ORACLES_VISUAL_MODE=full same as --full
	# ORACLES_VISUAL_MODE=lean (default) same as --lean
	# KEEP_LLM_WARM=1 keep Modal LLM running
	# ORACLES_GGUF_PATH=/path/to/model.gguf used in --local-llama mode
	# ORACLES_LOCAL_LLAMA_PORT=8080 (default) llama-server port
	#
	# Default-backend prereqs (.env.local or shell):
	# MODAL_URL, MODAL_KEY, MODAL_SECRET — set by `modal setup` + proxy tokens.
	#
	# Local-backend prereqs:
	# .venv/bin/pip install 'llama-cpp-python[server]'
	# modal volume get oracles-lora-ckpts /gguf ./gguf-out (one-time download)
	# ORACLES_GGUF_PATH=./gguf-out/oracles-wizard-14b-q4_k_m.gguf

	# Abort on the first unhandled failing command, then switch to the directory
	# that contains this script so the relative paths used by the rest of the
	# launcher resolve the same way regardless of the caller's cwd.
	set -o errexit
	cd "$(dirname "$0")"

	# -----------------------------------------------------------------------------
	# Flag parsing. Options this launcher understands are consumed here; every
	# other argument is collected, in order, in APP_ARGS for app.py.
	# -----------------------------------------------------------------------------
	# KEEP_WARM starts from $KEEP_LLM_WARM and falls back to 1 (leave the Modal
	# app running): the deployed HF Space shares the same Modal endpoint and
	# would hit an APIConnectionError whenever a local run stopped it.
	# --stop-on-exit opts out and stops billing.
	APP_ARGS=()
	SAVE_TRACE_DIR=""
	USE_LOCAL_LLAMA="0"
	USE_BASE_MODEL="0"
	KEEP_WARM="${KEEP_LLM_WARM:-1}"
	for opt; do
	  # --save-trace=DIR carries its value inline; handle it before the exact
	  # matches below.
	  if [[ "$opt" == --save-trace=* ]]; then
	    SAVE_TRACE_DIR="${opt#--save-trace=}"
	    continue
	  fi
	  case "$opt" in
	    --save-trace)
	      SAVE_TRACE_DIR="./traces"
	      ;;
	    --no-trace)
	      export ORACLES_TRACE_DISABLE="1"
	      ;;
	    --full)
	      export ORACLES_VISUAL_MODE="full"
	      ;;
	    --lean)
	      export ORACLES_VISUAL_MODE="lean"
	      ;;
	    --local-llama)
	      USE_LOCAL_LLAMA="1"
	      ;;
	    --base-model)
	      USE_BASE_MODEL="1"
	      ;;
	    --stop-on-exit)
	      KEEP_WARM="0"
	      ;;
	    --keep-warm)
	      # Kept for backward compatibility; 1 is already the default.
	      KEEP_WARM="1"
	      ;;
	    *)
	      APP_ARGS+=("$opt")
	      ;;
	  esac
	done

	# When a trace directory was requested, create it (parents included), hand it
	# to the app through ORACLES_TRACE_DIR and tell the user where the JSONL
	# files will be written.
	if [[ -n "$SAVE_TRACE_DIR" ]]; then
	  mkdir -p "$SAVE_TRACE_DIR"
	  ORACLES_TRACE_DIR="$SAVE_TRACE_DIR"
	  export ORACLES_TRACE_DIR
	  printf '%s\n' "[run.sh] --save-trace: appending LLM exchanges to ${SAVE_TRACE_DIR}/oracles-trace-<session>.jsonl"
	fi

	# Announce the visual mode. Anything other than the exact value "full"
	# (including an unset or empty ORACLES_VISUAL_MODE) is reported as lean;
	# --full is meant for bandwidth-rich local runs that want the parallax
	# banner, parchment texture, phase backdrops, scene landscapes, etc.
	case "${ORACLES_VISUAL_MODE:-lean}" in
	  full)
	    echo "[run.sh] --full: ORACLES_VISUAL_MODE=full (all PNGs / textures / backdrops enabled)"
	    ;;
	  *)
	    echo "[run.sh] lean mode (default) — pass --full to enable all visuals"
	    ;;
	esac

	# -----------------------------------------------------------------------------
	# Load .env.local — look in both the project root (parent directory) and
	# oracles_app/ (current directory) so shared credentials are picked up. The
	# parent file is sourced first, so the nearer file wins on conflicts.
	# `set -a` auto-exports every variable the file assigns.
	# -----------------------------------------------------------------------------
	for env_file in "../.env.local" "./.env.local"; do
	  if [ -f "$env_file" ]; then
	    # The explicit "./" matters: `.` resolves a slash-less name through $PATH
	    # before falling back to the current directory, so a bare ".env.local"
	    # could load a same-named file from an unrelated PATH entry.
	    set -a
	    . "$env_file"
	    set +a
	  fi
	done

	# Default Gradio analytics to off and the HuggingFace hub to offline mode
	# unless the caller already set a non-empty value: both import-time probes
	# can hang on old SSL stacks, and skipping them keeps boot fast.
	: "${GRADIO_ANALYTICS_ENABLED:=0}" "${HF_HUB_OFFLINE:=1}"
	export GRADIO_ANALYTICS_ENABLED HF_HUB_OFFLINE

	# Choose the interpreter: start from the system python3, then let an
	# executable venv interpreter replace it. The checks run from lowest to
	# highest priority, so the parent directory's venv wins over the local one.
	PY="python3"
	[ ! -x ".venv/bin/python" ] || PY=".venv/bin/python"
	[ ! -x "../.venv/bin/python" ] || PY="../.venv/bin/python"

	# Fail fast when gradio or openai cannot be imported by the chosen
	# interpreter. The import's own traceback is discarded; a one-line install
	# hint is written to stderr instead and the script exits with status 1.
	if ! "$PY" -c "import gradio, openai" 2>/dev/null; then
	  echo "Missing dependencies. Install with: $PY -m pip install -r requirements.txt" >&2
	  exit 1
	fi

	# -----------------------------------------------------------------------------
	# Served-model selection. --base-model forces ORACLES_LLM_MODEL=llm; otherwise
	# a non-empty value from the environment is kept and the default is
	# oracle-wizard-lora. Either way the variable is exported for the app.
	# -----------------------------------------------------------------------------
	case "$USE_BASE_MODEL" in
	  1)
	    ORACLES_LLM_MODEL="llm"
	    export ORACLES_LLM_MODEL
	    echo "[run.sh] --base-model: requesting bare base Qwen2.5-14B (no LoRA)."
	    ;;
	  *)
	    : "${ORACLES_LLM_MODEL:=oracle-wizard-lora}"
	    export ORACLES_LLM_MODEL
	    echo "[run.sh] Requesting fine-tune: $ORACLES_LLM_MODEL"
	    ;;
	esac

	# =============================================================================
	# BACKEND BRANCH 1 — Local llama.cpp (--local-llama)
	# Starts llama-cpp-python's OpenAI-compatible server in the background, waits
	# until it answers /v1/models, then runs the Gradio app against it by pointing
	# MODAL_URL at 127.0.0.1. The branch always ends in `exit`, so the Modal code
	# further down never runs in local mode.
	# =============================================================================
	if [ "$USE_LOCAL_LLAMA" = "1" ]; then
	  GGUF_PATH="${ORACLES_GGUF_PATH:-./gguf-out/oracles-wizard-14b-q4_k_m.gguf}"
	  LL_PORT="${ORACLES_LOCAL_LLAMA_PORT:-8080}"
	  LL_LOG="/tmp/oracles_local_llama.log"

	  if [ ! -f "$GGUF_PATH" ]; then
	    # `<<-` strips leading tabs, so the message and its terminator may be
	    # indented with tabs without changing the printed text.
	    cat <<-EOF >&2
	ERROR: GGUF file not found at $GGUF_PATH

	To use --local-llama you first need to download the quantized GGUF:
	modal volume get oracles-lora-ckpts /gguf ./gguf-out

	Then either:
	./run.sh --local-llama
	(which expects ./gguf-out/oracles-wizard-14b-q4_k_m.gguf by default)

	Or set the path explicitly:
	ORACLES_GGUF_PATH=/abs/path/to/model.gguf ./run.sh --local-llama
	EOF
	    exit 1
	  fi

	  if ! "$PY" -c "import llama_cpp.server" 2>/dev/null; then
	    echo "ERROR: llama-cpp-python's server module isn't installed." >&2
	    echo " Install with: $PY -m pip install 'llama-cpp-python[server]'" >&2
	    exit 1
	  fi

	  echo "[run.sh] --local-llama: starting llama_cpp.server in the background"
	  echo " model = $GGUF_PATH"
	  echo " port = $LL_PORT"

	  # Start llama-cpp's OpenAI-compatible server. On Apple Silicon we want
	  # n_gpu_layers=-1 so the Metal backend takes the whole model.
	  LL_PID_FILE=$(mktemp -t oracles_local_llama_pid.XXXXXX)
	  "$PY" -m llama_cpp.server \
	    --model "$GGUF_PATH" \
	    --host 127.0.0.1 \
	    --port "$LL_PORT" \
	    --n_gpu_layers -1 \
	    --n_ctx 8192 \
	    --model_alias oracle-wizard-lora \
	    > "$LL_LOG" 2>&1 &
	  LL_PID=$!
	  echo "$LL_PID" > "$LL_PID_FILE"
	  echo "[run.sh] llama_cpp.server PID=$LL_PID (log: $LL_LOG)"

	  # Cleanup handler: stop the background server (if it is still alive),
	  # remove the PID file and exit with the status seen on entry.
	  cleanup_local() {
	    local rc=$?
	    # Disarm first. The handler is registered for EXIT as well as INT/TERM,
	    # so without this the `exit` below would re-enter it through the EXIT
	    # trap after a signal.
	    trap - EXIT INT TERM
	    if kill -0 "$LL_PID" 2>/dev/null; then
	      echo ""
	      echo "[run.sh] Stopping local llama-cpp server (PID=$LL_PID)..."
	      # Best effort: the server may exit between the liveness check and here.
	      kill "$LL_PID" 2>/dev/null || true
	      wait "$LL_PID" 2>/dev/null || true
	    fi
	    rm -f "$LL_PID_FILE"
	    exit "$rc"
	  }
	  trap cleanup_local EXIT INT TERM

	  # Wait for the server to load the model. 14B Q4 on M-series ~15-40s.
	  # Up to 120 attempts; each costs a 2s sleep plus at most 2s of curl time,
	  # so elapsed time is read from $SECONDS instead of the attempt counter.
	  echo "[run.sh] Waiting for local server to load the model..."
	  HEALTHY=0
	  LL_WAIT_START=$SECONDS
	  for ((i = 1; i <= 120; i++)); do
	    if curl -s --max-time 2 "http://127.0.0.1:$LL_PORT/v1/models" \
	      | grep -q '"id"'; then
	      echo "[run.sh] Local server ready (took $((SECONDS - LL_WAIT_START))s)."
	      HEALTHY=1
	      break
	    fi
	    if ! kill -0 "$LL_PID" 2>/dev/null; then
	      echo "ERROR: llama_cpp.server died early. Check $LL_LOG" >&2
	      exit 1
	    fi
	    if [ $((i % 5)) -eq 0 ]; then
	      echo " ... still loading ($((SECONDS - LL_WAIT_START))s)"
	    fi
	    sleep 2
	  done

	  if [ "$HEALTHY" = "0" ]; then
	    echo "ERROR: local server never became ready. See $LL_LOG" >&2
	    exit 1
	  fi

	  # Point the app at the local endpoint. The existing LLMClient already
	  # speaks OpenAI's protocol so zero client code changes are needed.
	  export MODAL_URL="http://127.0.0.1:$LL_PORT"
	  export MODAL_KEY="local" # any non-empty value — local server ignores
	  export MODAL_SECRET="local"
	  export ORACLES_FORCE_MOCK=0
	  echo "[run.sh] App pointed at $MODAL_URL — no Modal calls will be made."
	  echo ""

	  "$PY" app.py "${APP_ARGS[@]}"
	  exit 0
	fi


	# =============================================================================
	# BACKEND BRANCH 2 — Modal vLLM (default)
	# =============================================================================
	# Modal app name, the absolute path of the directory above the current one,
	# and the deploy script expected beneath it.
	LLM_APP_NAME='forest-focus-llm'
	REPO_ROOT=$(
	  cd ..
	  pwd
	)
	LLM_SCRIPT="${REPO_ROOT}/modal_backend/modal_llm.py"

	# Preflight for the Modal backend: proxy credentials, the `modal` CLI and the
	# deploy script must all be present. Each failure prints to stderr and exits
	# with status 1.
	if [ -z "${MODAL_KEY:-}" ] || [ -z "${MODAL_SECRET:-}" ]; then
	  # Quoted delimiter: the text is printed literally. `<<-` strips leading
	  # tabs so the message and its terminator may be tab-indented.
	  cat <<-'EOF' >&2
	ERROR: MODAL_KEY and MODAL_SECRET are required for the default Modal backend.

	For the fully-offline local backend (no Modal needed), run with --local-llama
	after downloading the GGUF:
	modal volume get oracles-lora-ckpts /gguf ./gguf-out
	./run.sh --local-llama

	To use the default Modal backend, one-time setup:
	1. modal deploy modal_backend/modal_llm.py
	2. https://modal.com/settings/proxy-auth-tokens → Create
	3. Add to oracles_app/.env.local:
	MODAL_KEY=wk-xxxxxxxxx
	MODAL_SECRET=ws-xxxxxxxxx
	4. ./run.sh

	To swap the fine-tune for the bare base model: ./run.sh --base-model
	EOF
	  exit 1
	fi

	if ! command -v modal >/dev/null 2>&1; then
	  echo "ERROR: 'modal' CLI not found on PATH. Install: pip install modal" >&2
	  exit 1
	fi
	if [ ! -f "$LLM_SCRIPT" ]; then
	  echo "ERROR: LLM script not found at $LLM_SCRIPT" >&2
	  exit 1
	fi

	# Resolve the workspace name — the Modal proxy URL is determined by it.
	# Only the first line of `modal profile current` is used, with all whitespace
	# removed. A failing CLI call yields an empty string, which is reported below.
	WORKSPACE=$(modal profile current 2>/dev/null | head -1 | tr -d '[:space:]')
	if [ -z "$WORKSPACE" ]; then
	  echo "ERROR: could not determine Modal workspace. Run 'modal setup' first." >&2
	  exit 1
	fi
	LLM_URL="https://${WORKSPACE}--${LLM_APP_NAME}-serve.modal.run"

	echo "[run.sh] Workspace: $WORKSPACE"
	echo "[run.sh] LLM URL: $LLM_URL"

	# -----------------------------------------------------------------------------
	# Deploy. `modal deploy` is run on every launch; its combined output is shown
	# live and also saved to a log, from which the served URL is then read back.
	# -----------------------------------------------------------------------------
	DEPLOY_LOG="/tmp/oracles_modal_deploy.log"
	echo "[run.sh] Ensuring $LLM_APP_NAME is deployed..."
	# pipefail is enabled only inside this subshell: without it the pipeline's
	# status is tee's, and a failed deploy would go unnoticed.
	if ! (
	  set -o pipefail
	  modal deploy "$LLM_SCRIPT" 2>&1 | tee "$DEPLOY_LOG"
	); then
	  echo "ERROR: modal deploy failed. See $DEPLOY_LOG" >&2
	  exit 1
	fi

	# Prefer the first *.modal.run URL printed by the deploy over the URL derived
	# from the workspace name. `|| true` keeps a no-match grep from being fatal.
	PRINTED_URL=$(grep -oE 'https://[A-Za-z0-9.-]+\.modal\.run' "$DEPLOY_LOG" | head -1 || true)
	if [ -n "$PRINTED_URL" ]; then
	  LLM_URL="$PRINTED_URL"
	  echo "[run.sh] Using URL from deploy output: $LLM_URL"
	fi

	# -----------------------------------------------------------------------------
	# Cleanup trap — stop the Modal app when we exit (unless --keep-warm).
	# -----------------------------------------------------------------------------
	# Reads KEEP_WARM and LLM_APP_NAME. Exits with the status that was current
	# when the handler was entered.
	cleanup() {
	  local rc=$?
	  # Disarm first. The handler is registered for EXIT as well as INT/TERM, so
	  # without this the `exit` below would re-enter it through the EXIT trap
	  # after a signal, printing the message and stopping the app twice.
	  trap - EXIT INT TERM
	  if [ "$KEEP_WARM" = "1" ]; then
	    echo ""
	    echo "[run.sh] --keep-warm set; leaving $LLM_APP_NAME running on Modal."
	  else
	    echo ""
	    echo "[run.sh] Stopping $LLM_APP_NAME so the L40S stops billing..."
	    # Best effort: a failure to stop must not mask the original exit status.
	    modal app stop --yes "$LLM_APP_NAME" 2>/dev/null || true
	    echo "[run.sh] Stopped."
	  fi
	  exit "$rc"
	}
	trap cleanup EXIT INT TERM

	# -----------------------------------------------------------------------------
	# Wait for the endpoint: poll ${LLM_URL}/v1/models with the Modal proxy
	# headers until the reply contains an "id" field. Up to 120 attempts; each
	# costs a 5s sleep plus at most 5s of curl time, so elapsed time is read
	# from $SECONDS rather than derived from the attempt counter. An endpoint
	# that never answers only produces a warning; the launch still proceeds.
	# -----------------------------------------------------------------------------
	echo "[run.sh] Waiting for endpoint to become healthy (up to 10 minutes)..."
	HEALTHY=0
	SAW_LORA=0
	WAIT_START=$SECONDS
	for ((i = 1; i <= 120; i++)); do
	  # `|| true`: a failed request is just "not ready yet", not a fatal error.
	  RESPONSE=$(curl -s \
	    -H "Modal-Key: $MODAL_KEY" \
	    -H "Modal-Secret: $MODAL_SECRET" \
	    --max-time 5 \
	    "${LLM_URL}/v1/models" 2>/dev/null || true)
	  if echo "$RESPONSE" | grep -q '"id"'; then
	    echo "[run.sh] LLM ready (took $((SECONDS - WAIT_START))s)."
	    HEALTHY=1
	    if echo "$RESPONSE" | grep -q "oracle-wizard-lora"; then
	      SAW_LORA=1
	      echo "[run.sh] ✓ Fine-tune adapter 'oracle-wizard-lora' is served."
	    fi
	    break
	  fi
	  if [ $((i % 6)) -eq 0 ]; then
	    echo " ... still waiting ($((SECONDS - WAIT_START))s)"
	  fi
	  sleep 5
	done

	if [ "$HEALTHY" = "0" ]; then
	  echo "WARN: endpoint never returned a model list within 10 minutes." >&2
	elif [ "$USE_BASE_MODEL" = "0" ] && [ "$SAW_LORA" = "0" ]; then
	  echo "WARN: endpoint is up but did NOT advertise 'oracle-wizard-lora'." >&2
	fi

	# -----------------------------------------------------------------------------
	# Launch: export the endpoint URL and disable mock mode for the app, print
	# the browser hint, then run app.py with the pass-through arguments.
	# -----------------------------------------------------------------------------
	MODAL_URL="$LLM_URL"
	ORACLES_FORCE_MOCK=0
	export MODAL_URL ORACLES_FORCE_MOCK

	printf '%s\n' \
	  "" \
	  "============================================================" \
	  " Open in your browser: http://localhost:7860" \
	  " (NOT http://0.0.0.0:7860 — Chrome blocks that by default)" \
	  "============================================================" \
	  ""

	"$PY" app.py "${APP_ARGS[@]}"