atomwalk12
initial commit
0dd6c2f
#!/bin/bash
#
# Start a llama_cpp.server instance for a model given by URL.
#
# Positional arguments:
#   $1 - URL of the model file (required; checked right below).
#   $2 - value handed to the server's --n_gpu_layers option further down.
#
# The script activates ./.venv with `source`, so it has to be run with bash
# from the directory that contains .venv.

# Check if model URL is provided. Diagnostics go to stderr so they do not get
# mixed into captured stdout.
if [[ -z "${1:-}" ]]; then
  {
    echo "Error: Model URL is required"
    echo "Usage: $0 <model_url> <n_gpu_layers>"
    echo "Example: bash $0 https://huggingface.co/Salesforce/Llama-xLAM-2-8b-fc-r-gguf/resolve/main/Llama-xLAM-2-8B-fc-r-Q4_K_M.gguf 32"
  } >&2
  exit 1
fi
# Activate development environment.
# The path is relative, so this only works when the script is started from the
# directory that contains .venv. A missing environment is reported but is not
# fatal, matching the previous behaviour where a failed `source` was ignored.
if [[ -f ./.venv/bin/activate ]]; then
  # shellcheck disable=SC1091
  source ./.venv/bin/activate
else
  echo "Warning: ./.venv/bin/activate not found; continuing without it" >&2
fi

# Start llama.cpp server
echo "Starting llama.cpp server..."
echo "Processing model from: $1"

MODEL_DIR="./linalg_zero/distillation/llama-cpp/models"

# Ensure models directory exists
if ! mkdir -p "${MODEL_DIR}"; then
  echo "Error: Could not create model directory ${MODEL_DIR}" >&2
  exit 1
fi

# Extract filename from URL. Any "?query" or "#fragment" suffix is dropped
# first so it does not become part of the local file name; the full URL is
# still what gets downloaded.
MODEL_URL_PATH="${1%%[?#]*}"
MODEL_NAME="$(basename -- "${MODEL_URL_PATH}")"
if [[ -z "${MODEL_NAME}" ]]; then
  echo "Error: Could not derive a file name from $1" >&2
  exit 1
fi
MODEL_PATH="${MODEL_DIR}/${MODEL_NAME}"

# Download model if it doesn't exist locally
if [[ ! -f "${MODEL_PATH}" ]]; then
  echo "Model not found locally. Downloading..."
  # Download into a sibling ".part" file and rename it only on success, so an
  # interrupted or failed transfer is never mistaken for a cached model on the
  # next run. --fail (-f) makes curl return non-zero on HTTP errors instead of
  # saving the server's error page as the model file.
  MODEL_PART_PATH="${MODEL_PATH}.part"
  if curl -fL -o "${MODEL_PART_PATH}" "$1" \
    && mv -f -- "${MODEL_PART_PATH}" "${MODEL_PATH}"; then
    echo "Model successfully downloaded to: ${MODEL_PATH}"
  else
    rm -f -- "${MODEL_PART_PATH}"
    echo "Error: Failed to download model from $1" >&2
    exit 1
  fi
else
  echo "Model already exists locally: ${MODEL_PATH}"
fi
# Build the server command line as an array so that each value is passed as
# exactly one argument, even if the model path contains spaces or glob
# characters.
ARGS=(--model "${MODEL_PATH}")

# GPU offload: the number of layers comes from the second script argument.
# The GPU-offload layers must be tuned otherwise we get errors.
# When no value is given the option is left out entirely instead of passing
# "--n_gpu_layers" without a value, which the server's argument parser rejects.
if [[ -n "${2:-}" ]]; then
  echo "Setting GPU-offload layers to: ${2}"
  ARGS+=(--n_gpu_layers "${2}")
else
  echo "Warning: no GPU-offload layer count given; using the server default" >&2
fi

# Set the split mode to row
# LLAMA_SPLIT_MODE_NONE = 0
# LLAMA_SPLIT_MODE_LAYER = 1
# LLAMA_SPLIT_MODE_ROW = 2
ARGS+=(--split_mode 2)

# Set the context limit: 0 = model default, any other value = custom
ARGS+=(--n_ctx 2048)

# Set the host and port
ARGS+=(--host 0.0.0.0 --port 8000)

# Start the server. This runs in the foreground, so everything below is only
# reached once the server process has exited.
uv run python3 -m llama_cpp.server "${ARGS[@]}"
SERVER_STATUS=$?
echo "llama.cpp server exited with status ${SERVER_STATUS}" >&2

# NOTE(review): presumably the container is kept alive on purpose after the
# server stops (e.g. for inspection) - confirm this is the intended behaviour.
sleep 10
echo "Keeping the container running..."
tail -f /dev/null