#!/usr/bin/env bash
# Launch The Wizard's Oracles.
#
# Default backend: Modal-hosted vLLM + LoRA on an L40S in the cloud.
# Optional backend (--local-llama): llama.cpp running on YOUR machine,
# fully offline — see below.
#
# Flags:
#   --base-model        Request bare base Qwen (no LoRA) from whichever
#                       backend is running. Equivalent to
#                       ORACLES_LLM_MODEL=llm.
#   --keep-warm         (default) Leave the Modal LLM running on exit
#                       so the deployed HF Space can still reach it.
#                       Ignored in --local-llama mode.
#   --stop-on-exit      Inverse of --keep-warm: stop the Modal app when
#                       this script exits. Useful only if you know no
#                       other clients (HF Space, teammates) are using it.
#   --local-llama       Run llama-cpp-python locally on a GGUF copy of
#                       the model — no Modal calls, no cloud. Requires
#                       the .gguf file already on disk (see env var
#                       ORACLES_GGUF_PATH below).
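#   --save-trace[=DIR]  (default ON) Append every LLM request/response to
#                       a JSONL file under DIR (default: ./traces/). Use
#                       the --save-trace=DIR form to choose a directory; a
#                       space-separated DIR is passed through to app.py.
#                       Drop the file into an HF dataset repo for the
#                       Sharing-is-Caring badge.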
#   --no-trace          Disable trace writing for this run (sets
#                       ORACLES_TRACE_DISABLE=1).
#   --full              Enable ALL decorative PNGs (parallax banner,
#                       parchment, phase backdrops, scene landscapes,
#                       wizard-desk, open-book, demo-card backdrop).
#                       Recommended for local runs on fast connections.
#   --lean              (default) Skip the heavy decorative PNGs. Used by
#                       the HF Space deployment because its egress
#                       bandwidth (~15 KB/s) makes the full set add
#                       multi-megabytes to every cold load.
#
# Env vars:
#   ORACLES_LLM_MODEL=llm                       request base model
#   ORACLES_LLM_MODEL=oracle-wizard-lora        (default) request fine-tune
#   ORACLES_VISUAL_MODE=full                    same as --full
#   ORACLES_VISUAL_MODE=lean                    (default) same as --lean
#   KEEP_LLM_WARM=1                             (default) keep Modal LLM running on exit; any other value stops it
#   ORACLES_GGUF_PATH=/path/to/model.gguf       used in --local-llama mode
#   ORACLES_LOCAL_LLAMA_PORT=8080               (default) llama-server port
#
# Default-backend prereqs (.env.local or shell):
#   MODAL_URL, MODAL_KEY, MODAL_SECRET — set by `modal setup` + proxy tokens.
#
# Local-backend prereqs:
#   .venv/bin/pip install 'llama-cpp-python[server]'
#   modal volume get oracles-lora-ckpts /gguf ./gguf-out  (one-time download)
#   ORACLES_GGUF_PATH=./gguf-out/oracles-wizard-14b-q4_k_m.gguf

# Abort on the first unhandled command failure, then work from the directory
# that contains this script so the relative paths used below resolve.
set -o errexit
script_dir="$(dirname "$0")"
cd "$script_dir"

# -----------------------------------------------------------------------------
# Parse our own flags (everything else gets passed through to app.py)
# -----------------------------------------------------------------------------
# Default to keep-warm so the deployed HF Space (which shares the same
# Modal endpoint) doesn't get an APIConnectionError every time a local
# run exits. Pass --stop-on-exit to opt out and stop billing.
# Flag state. KEEP_WARM is seeded from $KEEP_LLM_WARM (1 when unset/empty);
# everything this script does not recognise is collected in APP_ARGS and
# forwarded to app.py untouched.
APP_ARGS=()
SAVE_TRACE_DIR=""
USE_LOCAL_LLAMA="0"
USE_BASE_MODEL="0"
KEEP_WARM="${KEEP_LLM_WARM:-1}"

for opt; do
    case "$opt" in
        --full | --lean)
            # The flag name minus its leading dashes is the mode value.
            export ORACLES_VISUAL_MODE="${opt#--}"
            ;;
        --no-trace)
            export ORACLES_TRACE_DISABLE="1"
            ;;
        --save-trace=*)
            SAVE_TRACE_DIR="${opt#--save-trace=}"
            ;;
        --save-trace)
            SAVE_TRACE_DIR="./traces"
            ;;
        --local-llama)
            USE_LOCAL_LLAMA="1"
            ;;
        --stop-on-exit)
            KEEP_WARM="0"
            ;;
        --keep-warm)
            # Matches the default; only changes anything when
            # KEEP_LLM_WARM was set to something other than 1.
            KEEP_WARM="1"
            ;;
        --base-model)
            USE_BASE_MODEL="1"
            ;;
        *)
            APP_ARGS+=("$opt")
            ;;
    esac
done

# When a trace directory was requested on the command line, make sure it
# exists and hand its path to the app through ORACLES_TRACE_DIR.
case "$SAVE_TRACE_DIR" in
    "")
        ;;
    *)
        mkdir -p "$SAVE_TRACE_DIR"
        ORACLES_TRACE_DIR="$SAVE_TRACE_DIR"
        export ORACLES_TRACE_DIR
        echo "[run.sh] --save-trace: appending LLM exchanges to $SAVE_TRACE_DIR/oracles-trace-<session>.jsonl"
        ;;
esac

# Visual-mode banner. The app defaults to lean; --full overrides for local
# bandwidth-rich runs that want the parallax banner, parchment texture,
# phase backdrops, scene landscapes, etc.
# Announce which visual mode is in effect; anything other than "full"
# (including unset) is reported as lean.
case "${ORACLES_VISUAL_MODE:-lean}" in
    full)
        echo "[run.sh] --full: ORACLES_VISUAL_MODE=full (all PNGs / textures / backdrops enabled)"
        ;;
    *)
        echo "[run.sh] lean mode (default) — pass --full to enable all visuals"
        ;;
esac

# -----------------------------------------------------------------------------
# Load .env.local — look in both the project root and oracles_app/ so
# shared credentials are picked up.
# -----------------------------------------------------------------------------
# Source every .env.local that exists: the parent directory first, then the
# one next to this script, so the latter wins on conflicting names. `set -a`
# makes each variable the file assigns exported automatically.
#
# The second path carries an explicit "./": bash's `.` builtin searches $PATH
# before the current directory for a name without a slash, so a stray
# .env.local in a PATH directory would otherwise be loaded instead.
#
# NOTE(review): this runs after flag parsing, so an assignment in these files
# overrides same-named variables exported earlier (e.g. by --full/--no-trace).
for env_file in "../.env.local" "./.env.local"; do
    [ -f "$env_file" ] || continue
    set -a
    # shellcheck disable=SC1090  # path is only known at runtime
    . "$env_file"
    set +a
done

# Skip Gradio's import-time analytics + HuggingFace probe — both can hang on
# old SSL stacks. Also keeps boot fast.
# Fill in defaults only when the caller has not already chosen a value,
# then export both for the Python process.
: "${GRADIO_ANALYTICS_ENABLED:=0}"
: "${HF_HUB_OFFLINE:=1}"
export GRADIO_ANALYTICS_ENABLED HF_HUB_OFFLINE

# Pick a Python.
# Prefer a virtualenv interpreter (parent directory first, then this one);
# fall back to whatever `python3` resolves to on PATH.
PY="python3"
for candidate in "../.venv/bin/python" ".venv/bin/python"; do
    if [ -x "$candidate" ]; then
        PY="$candidate"
        break
    fi
done

"$PY" -c "import gradio, openai" 2>/dev/null || {
    echo "Missing dependencies. Install with: $PY -m pip install -r requirements.txt"
    exit 1
}

# -----------------------------------------------------------------------------
# Decide which served model the app will request.
# -----------------------------------------------------------------------------
# --base-model forces the served-model name "llm"; otherwise keep any
# ORACLES_LLM_MODEL already in the environment, defaulting to the fine-tune.
case "$USE_BASE_MODEL" in
    1)
        ORACLES_LLM_MODEL="llm"
        export ORACLES_LLM_MODEL
        echo "[run.sh] --base-model: requesting bare base Qwen2.5-14B (no LoRA)."
        ;;
    *)
        : "${ORACLES_LLM_MODEL:=oracle-wizard-lora}"
        export ORACLES_LLM_MODEL
        echo "[run.sh] Requesting fine-tune: $ORACLES_LLM_MODEL"
        ;;
esac

# =============================================================================
# BACKEND BRANCH 1 — Local llama.cpp (--local-llama)
# Runs llama-cpp-python's OpenAI-compatible server in the background. The
# Gradio app sees it as just another endpoint via MODAL_URL=http://localhost.
# =============================================================================
if [ "$USE_LOCAL_LLAMA" = "1" ]; then
    # Resolve the model file, the listening port and the server log location.
    GGUF_PATH="${ORACLES_GGUF_PATH:-./gguf-out/oracles-wizard-14b-q4_k_m.gguf}"
    LL_PORT="${ORACLES_LOCAL_LLAMA_PORT:-8080}"
    LL_LOG="/tmp/oracles_local_llama.log"

    if [ ! -f "$GGUF_PATH" ]; then
        cat <<EOF >&2
ERROR: GGUF file not found at $GGUF_PATH

To use --local-llama you first need to download the quantized GGUF:
    modal volume get oracles-lora-ckpts /gguf ./gguf-out

Then either:
  ./run.sh --local-llama
  (which expects ./gguf-out/oracles-wizard-14b-q4_k_m.gguf by default)

Or set the path explicitly:
  ORACLES_GGUF_PATH=/abs/path/to/model.gguf ./run.sh --local-llama
EOF
        exit 1
    fi

    if ! "$PY" -c "import llama_cpp.server" 2>/dev/null; then
        echo "ERROR: llama-cpp-python's server module isn't installed." >&2
        echo "       Install with:  $PY -m pip install 'llama-cpp-python[server]'" >&2
        exit 1
    fi

    # The readiness probe below shells out to curl. Without it the loop would
    # spin for its full duration and then blame the server, so check up front.
    if ! command -v curl >/dev/null 2>&1; then
        echo "ERROR: 'curl' is required to health-check the local server." >&2
        exit 1
    fi

    echo "[run.sh] --local-llama: starting llama_cpp.server in the background"
    echo "         model = $GGUF_PATH"
    echo "         port  = $LL_PORT"

    # Start llama-cpp's OpenAI-compatible server. On Apple Silicon we want
    # n_gpu_layers=-1 so the Metal backend takes the whole model.
    LL_PID_FILE=$(mktemp -t oracles_local_llama_pid.XXXXXX)
    "$PY" -m llama_cpp.server \
        --model "$GGUF_PATH" \
        --host 127.0.0.1 \
        --port "$LL_PORT" \
        --n_gpu_layers -1 \
        --n_ctx 8192 \
        --model_alias oracle-wizard-lora \
        > "$LL_LOG" 2>&1 &
    LL_PID=$!
    echo "$LL_PID" > "$LL_PID_FILE"
    echo "[run.sh] llama_cpp.server PID=$LL_PID (log: $LL_LOG)"

    # Exit handler: stop the background server (if it is still alive), remove
    # the PID file and leave with the status the script was exiting with.
    cleanup_local() {
        local rc=$?
        # Disarm first. On INT/TERM the `exit` below would otherwise fire the
        # EXIT trap and run this handler a second time.
        trap - EXIT INT TERM
        if kill -0 "$LL_PID" 2>/dev/null; then
            echo ""
            echo "[run.sh] Stopping local llama-cpp server (PID=$LL_PID)..."
            kill "$LL_PID" 2>/dev/null || true
            wait "$LL_PID" 2>/dev/null || true
        fi
        rm -f "$LL_PID_FILE"
        exit "$rc"
    }
    trap cleanup_local EXIT INT TERM

    # Wait for the server to load the model. 14B Q4 on M-series ~15-40s.
    # Up to 120 probes; each costs at most 2s of curl plus a 2s sleep, so
    # elapsed time is taken from $SECONDS rather than the probe count.
    echo "[run.sh] Waiting for local server to load the model..."
    HEALTHY=0
    wait_start=$SECONDS
    for ((attempt = 1; attempt <= 120; attempt++)); do
        if curl -s --max-time 2 "http://127.0.0.1:$LL_PORT/v1/models" \
                | grep -q '"id"'; then
            echo "[run.sh] Local server ready (took $((SECONDS - wait_start))s)."
            HEALTHY=1
            break
        fi
        # Bail out immediately if the server process is gone.
        if ! kill -0 "$LL_PID" 2>/dev/null; then
            echo "ERROR: llama_cpp.server died early. Check $LL_LOG" >&2
            exit 1
        fi
        if [ $((attempt % 5)) -eq 0 ]; then
            echo "  ... still loading ($((SECONDS - wait_start))s)"
        fi
        sleep 2
    done

    if [ "$HEALTHY" = "0" ]; then
        echo "ERROR: local server never became ready. See $LL_LOG" >&2
        exit 1
    fi

    # Point the app at the local endpoint. The existing LLMClient already
    # speaks OpenAI's protocol so zero client code changes are needed.
    export MODAL_URL="http://127.0.0.1:$LL_PORT"
    export MODAL_KEY="local"      # any non-empty value — local server ignores
    export MODAL_SECRET="local"
    export ORACLES_FORCE_MOCK=0
    echo "[run.sh] App pointed at $MODAL_URL — no Modal calls will be made."
    echo ""

    # Under `set -e` a failing app exits here with its own status (the EXIT
    # trap still runs); reaching `exit 0` means the app ended cleanly.
    "$PY" app.py "${APP_ARGS[@]}"
    exit 0
fi


# =============================================================================
# BACKEND BRANCH 2 — Modal vLLM (default)
# =============================================================================
# Name of the deployed Modal app, the repository root (parent of the current
# directory) and the deploy script that lives under it.
LLM_APP_NAME='forest-focus-llm'
REPO_ROOT=$(cd .. && pwd)
LLM_SCRIPT="${REPO_ROOT}/modal_backend/modal_llm.py"

# Both proxy-auth values must be non-empty for the Modal backend; otherwise
# print the setup instructions to stderr and stop.
if ! { [ -n "${MODAL_KEY:-}" ] && [ -n "${MODAL_SECRET:-}" ]; }; then
    cat >&2 <<'EOF'
ERROR: MODAL_KEY and MODAL_SECRET are required for the default Modal backend.

For the fully-offline local backend (no Modal needed), run with --local-llama
after downloading the GGUF:
    modal volume get oracles-lora-ckpts /gguf ./gguf-out
    ./run.sh --local-llama

To use the default Modal backend, one-time setup:
  1. modal deploy modal_backend/modal_llm.py
  2. https://modal.com/settings/proxy-auth-tokens  →  Create
  3. Add to oracles_app/.env.local:
       MODAL_KEY=wk-xxxxxxxxx
       MODAL_SECRET=ws-xxxxxxxxx
  4. ./run.sh

To swap the fine-tune for the bare base model: ./run.sh --base-model
EOF
    exit 1
fi

# The Modal backend needs the `modal` CLI on PATH and the deploy script on
# disk; stop with a message on stderr if either is missing.
command -v modal >/dev/null 2>&1 || {
    echo "ERROR: 'modal' CLI not found on PATH. Install: pip install modal" >&2
    exit 1
}
[ -f "$LLM_SCRIPT" ] || {
    echo "ERROR: LLM script not found at $LLM_SCRIPT" >&2
    exit 1
}

# Resolve the workspace name — the Modal proxy URL is determined by it.
# Take the first line of `modal profile current`, with all whitespace removed,
# as the workspace name and build the default endpoint URL from it.
WORKSPACE=$(modal profile current 2>/dev/null | head -n 1 | tr -d '[:space:]')
if [ -z "$WORKSPACE" ]; then
    # Fatal diagnostics go to stderr, like the other prerequisite checks.
    echo "ERROR: could not determine Modal workspace. Run 'modal setup' first." >&2
    exit 1
fi
LLM_URL="https://${WORKSPACE}--${LLM_APP_NAME}-serve.modal.run"

echo "[run.sh] Workspace: $WORKSPACE"
echo "[run.sh] LLM URL:   $LLM_URL"

# -----------------------------------------------------------------------------
# Deploy if not already deployed.
# -----------------------------------------------------------------------------
DEPLOY_LOG="/tmp/oracles_modal_deploy.log"

echo "[run.sh] Ensuring $LLM_APP_NAME is deployed..."
# Without pipefail a pipeline reports only its last stage, i.e. tee, so a
# failed `modal deploy` would go unnoticed. Enable pipefail inside a subshell
# so the setting cannot leak into the script's other pipelines.
if ! (set -o pipefail; modal deploy "$LLM_SCRIPT" 2>&1 | tee "$DEPLOY_LOG"); then
    echo "ERROR: modal deploy failed. See $DEPLOY_LOG" >&2
    exit 1
fi

# Prefer the first *.modal.run URL found in the deploy output over the URL
# computed from the workspace name. `|| true` keeps `set -e` from aborting
# when the log contains no such URL.
PRINTED_URL=$(grep -oE 'https://[A-Za-z0-9.-]+\.modal\.run' "$DEPLOY_LOG" | head -n 1 || true)
if [ -n "$PRINTED_URL" ]; then
    LLM_URL="$PRINTED_URL"
    echo "[run.sh] Using URL from deploy output: $LLM_URL"
fi

# -----------------------------------------------------------------------------
# Cleanup trap — stop the Modal app when we exit (unless --keep-warm).
# -----------------------------------------------------------------------------
# Exit handler for the Modal backend.
# Globals:  KEEP_WARM (read), LLM_APP_NAME (read)
# Outputs:  status lines on stdout; a warning on stderr if the stop fails
# Exits:    with the status the script was exiting with when it was invoked
cleanup() {
    local rc=$?
    # Disarm first. On INT/TERM the `exit` below would otherwise fire the EXIT
    # trap and run this handler (and `modal app stop`) a second time.
    trap - EXIT INT TERM
    if [ "$KEEP_WARM" = "1" ]; then
        echo ""
        echo "[run.sh] --keep-warm set; leaving $LLM_APP_NAME running on Modal."
    else
        echo ""
        echo "[run.sh] Stopping $LLM_APP_NAME so the L40S stops billing..."
        # Still best-effort, but do not claim success when the stop failed.
        if modal app stop --yes "$LLM_APP_NAME" 2>/dev/null; then
            echo "[run.sh] Stopped."
        else
            echo "[run.sh] WARN: 'modal app stop' failed; $LLM_APP_NAME may still be running." >&2
        fi
    fi
    exit "$rc"
}
trap cleanup EXIT INT TERM

# -----------------------------------------------------------------------------
# Wait for the endpoint.
# -----------------------------------------------------------------------------
# Poll ${LLM_URL}/v1/models until it returns a model list or 10 minutes have
# passed. Each poll costs up to 5s of curl plus a 5s sleep, so the limit and
# the reported durations are measured with $SECONDS instead of being derived
# from the poll count.
echo "[run.sh] Waiting for endpoint to become healthy (up to 10 minutes)..."
HEALTHY=0
SAW_LORA=0
wait_limit=600
wait_start=$SECONDS
polls=0
while [ $((SECONDS - wait_start)) -lt "$wait_limit" ]; do
    # `|| true`: a failed or timed-out request is just an unsuccessful poll.
    RESPONSE=$(curl -s \
            -H "Modal-Key: $MODAL_KEY" \
            -H "Modal-Secret: $MODAL_SECRET" \
            --max-time 5 \
            "${LLM_URL}/v1/models" 2>/dev/null || true)
    if grep -q '"id"' <<<"$RESPONSE"; then
        echo "[run.sh] LLM ready (took $((SECONDS - wait_start))s)."
        HEALTHY=1
        if grep -q "oracle-wizard-lora" <<<"$RESPONSE"; then
            SAW_LORA=1
            echo "[run.sh] ✓ Fine-tune adapter 'oracle-wizard-lora' is served."
        fi
        break
    fi
    polls=$((polls + 1))
    if [ $((polls % 6)) -eq 0 ]; then
        echo "  ... still waiting ($((SECONDS - wait_start))s)"
    fi
    sleep 5
done

# Neither condition is fatal: the app is launched regardless, so these are
# warnings only (sent to stderr).
if [ "$HEALTHY" = "0" ]; then
    echo "WARN: endpoint never returned a model list within 10 minutes." >&2
elif [ "$USE_BASE_MODEL" = "0" ] && [ "$SAW_LORA" = "0" ]; then
    echo "WARN: endpoint is up but did NOT advertise 'oracle-wizard-lora'." >&2
fi

# -----------------------------------------------------------------------------
# Hand off to the app.
# -----------------------------------------------------------------------------
# Publish the endpoint URL and disable mock mode for the app process.
MODAL_URL="$LLM_URL"
ORACLES_FORCE_MOCK=0
export MODAL_URL ORACLES_FORCE_MOCK

# Browser hint, framed by blank lines.
cat <<'EOF'

============================================================
  Open in your browser:  http://localhost:7860
  (NOT http://0.0.0.0:7860 — Chrome blocks that by default)
============================================================

EOF

# Run the app in the foreground, forwarding every unrecognised flag.
"$PY" app.py "${APP_ARGS[@]}"