#!/bin/bash

# Check if model URL is provided
if [ -z "$1" ]; then
    echo "Error: Model URL is required"
    echo "Usage: $0 <model_url>"
    echo "Example: sh $0 https://huggingface.co/Salesforce/Llama-xLAM-2-8b-fc-r-gguf/resolve/main/Llama-xLAM-2-8B-fc-r-Q4_K_M.gguf"
    exit 1
fi

# Activate development environment
source ./.venv/bin/activate

# Start llama.cpp server
echo "Starting llama.cpp server..."

echo "Processing model from: $1"
MODEL_DIR="./linalg_zero/distillation/llama-cpp/models"
# Ensure models directory exists
mkdir -p "${MODEL_DIR}"

# Extract filename from URL
MODEL_NAME=$(basename "$1")
MODEL_PATH="${MODEL_DIR}/${MODEL_NAME}"

# Download model if it doesn't exist locally
if [ ! -f "${MODEL_PATH}" ]; then
    echo "Model not found locally. Downloading..."
    if curl -L -o "${MODEL_PATH}" "$1"; then
        echo "Model successfully downloaded to: ${MODEL_PATH}"
    else
        echo "Error: Failed to download model from $1"
        exit 1
    fi
else
    echo "Model already exists locally: ${MODEL_PATH}"
fi

# Set the model path and alias
ARGS="--model ${MODEL_PATH}"

# Offload all layers to GPU where possible
# The GPU-offload layers must be tuned otherwise we get errors.
echo "Setting GPU-offload layers to: ${2}"
ARGS="${ARGS} --n_gpu_layers ${2}"

# Set the split mode to row
# LLAMA_SPLIT_MODE_NONE = 0
# LLAMA_SPLIT_MODE_LAYER = 1
# LLAMA_SPLIT_MODE_ROW = 2
ARGS="${ARGS} --split_mode 2"

# Set the context limit: 0 = model default, any other value = custom
ARGS="${ARGS} --n_ctx 2048"

# Set the host and port
ARGS="${ARGS} --host 0.0.0.0 --port 8000"

# Start the server
uv run python3 -m llama_cpp.server $ARGS
sleep 10

echo "Keeping the container running..."
tail -f /dev/null