#!/bin/bash
# Script adapted from https://github.com/zeeb0tt/runpod-llm/blob/8037d1cf1ebcb8310071b9d302b036e8646f872a/2_runtime_setup_llamacpp.sh
# Install the llama.cpp Python binding together with its "server" extra.
# CMAKE_ARGS is forwarded to the package's native build:
#   -DGGML_CUDA=on                - build with the CUDA backend
#   -DGGML_CUDA_FA_ALL_QUANTS=ON  - build the CUDA flash-attention kernels for
#                                   all KV-cache quantization combinations
echo "Installing llama.cpp Python binding..."
# The requirement is quoted so the shell cannot interpret "[server]" as a glob
# character class and rewrite it to a matching file name in the working dir.
if ! CMAKE_ARGS="-DGGML_CUDA=on -DGGML_CUDA_FA_ALL_QUANTS=ON" \
  pip3 install --root-user-action=ignore 'llama-cpp-python[server]'; then
  # Report the failure but keep going, so the container stays available for
  # debugging exactly as it did before.
  echo "WARNING: installing llama-cpp-python failed; the server will probably not start." >&2
fi
# Start llama.cpp server: build the command-line options in ARGS.
echo "Starting llama.cpp server..."

#######################################
# Translate a KV-cache quantization name into the numeric id expected by the
# server's --type_k / --type_v options.
# The ids are `enum ggml_type` values (ggml.h). They are NOT the `llama_ftype`
# model file-type ids, which use different numbers for the same names
# (e.g. q8_0 is ggml_type 8 but llama_ftype 7).
# Arguments: $1 - quantization name: f32, f16, bf16, q8_0, q4_0, q4_1,
#                 iq4_nl, q5_0 or q5_1
# Outputs:   the id on stdout; a warning on stderr for an unknown name
# Returns:   0 always; unknown names fall back to f16 (id 1)
#######################################
kv_cache_type_id() {
  case "$1" in
    f32)    printf '0\n' ;;
    f16)    printf '1\n' ;;
    q4_0)   printf '2\n' ;;
    q4_1)   printf '3\n' ;;
    q5_0)   printf '6\n' ;;
    q5_1)   printf '7\n' ;;
    q8_0)   printf '8\n' ;;
    iq4_nl) printf '20\n' ;;
    bf16)   printf '30\n' ;;
    *)
      echo "WARNING: unknown cache quantization '$1'; falling back to f16." >&2
      printf '1\n'
      ;;
  esac
}

# ARGS is a plain space-separated string that is word-split when the server is
# launched, so none of the values added below may contain whitespace.

# Set the model path and alias
MODEL_PATH="${RUNPOD_LLM_MODEL_DIR}/${RUNPOD_LLM_MODEL_FILE_NAME}"
if [[ ! -f "${MODEL_PATH}" ]]; then
  echo "WARNING: model file '${MODEL_PATH}' does not exist." >&2
fi
ARGS="--model ${MODEL_PATH}"
# Only pass an alias when one is configured: an empty value would leave
# --model_alias without an argument once ARGS is word-split.
if [[ -n "${RUNPOD_LLM_MODEL_ALIAS}" ]]; then
  ARGS="${ARGS} --model_alias ${RUNPOD_LLM_MODEL_ALIAS}"
fi

# Offload all layers to GPU where possible
ARGS="${ARGS} --n_gpu_layers -1"

# Flash attention: "1" enables it, any other non-empty value disables it,
# and an unset/empty variable leaves the server default untouched.
if [[ -n "${RUNPOD_LLM_FLASH_ATTENTION}" ]]; then
  if [[ "${RUNPOD_LLM_FLASH_ATTENTION}" == "1" ]]; then
    ARGS="${ARGS} --flash_attn True"
  else
    ARGS="${ARGS} --flash_attn False"
  fi
fi

# Set the split mode to row
# LLAMA_SPLIT_MODE_NONE = 0
# LLAMA_SPLIT_MODE_LAYER = 1
# LLAMA_SPLIT_MODE_ROW = 2
ARGS="${ARGS} --split_mode 2"

# Set the context limit: 0 = model default, any other value = custom
if [[ -n "${RUNPOD_LLM_CONTEXT_LIMIT}" ]]; then
  ARGS="${ARGS} --n_ctx ${RUNPOD_LLM_CONTEXT_LIMIT}"
fi

# Set the KV-cache quantization type (the same type is used for keys and values)
if [[ -n "${RUNPOD_LLM_CACHE_QUANTIZATION}" ]]; then
  QUANT_VALUE="$(kv_cache_type_id "${RUNPOD_LLM_CACHE_QUANTIZATION}")"
  ARGS="${ARGS} --type_k ${QUANT_VALUE} --type_v ${QUANT_VALUE}"
fi

# Set the host and port
ARGS="${ARGS} --host 0.0.0.0 --port 11434"
# Start the server in the background; stdout and stderr both go to the log file.
# ARGS is expanded unquoted on purpose: it is a space-separated option string
# that has to be word-split into individual arguments.
# shellcheck disable=SC2086
python3 -m llama_cpp.server $ARGS > /app/runtime_llamacpp.log 2>&1 &
SERVER_PID=$!

# Give the server some time to come up, then make sure it has not already
# exited (e.g. missing model file or failed install). Only warn, so the
# container stays up for inspection.
sleep 10
if ! kill -0 "${SERVER_PID}" 2>/dev/null; then
  echo "WARNING: llama.cpp server exited during startup; see /app/runtime_llamacpp.log" >&2
fi

# RUNPOD_LLM_SERVERLESS=1 runs the serverless handler in the foreground;
# otherwise block forever so the container (and the background server) stay alive.
if [[ "${RUNPOD_LLM_SERVERLESS}" == "1" ]]; then
  echo "Starting the RunPod serverless handler..."
  python3 -u /app/3_runtime_runpod_serverless.py
else
  echo "Keeping the container running..."
  tail -f /dev/null
fi