# Script adapted from https://github.com/zeeb0tt/runpod-llm/blob/8037d1cf1ebcb8310071b9d302b036e8646f872a/2_runtime_setup_llamacpp.sh

# Install llama.cpp Python binding
echo "Installing llama.cpp Python binding..."
CMAKE_ARGS="-DGGML_CUDA=on -DGGML_CUDA_FA_ALL_QUANTS=ON" pip3 install --root-user-action=ignore "llama-cpp-python[server]"
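# Note: with CMAKE_ARGS set, pip typically builds llama.cpp from source with CUDA
# enabled, which can take several minutes on first run.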
# Start llama.cpp server
echo "Starting llama.cpp server..."

# Set the model path and alias
MODEL_PATH="${RUNPOD_LLM_MODEL_DIR}/${RUNPOD_LLM_MODEL_FILE_NAME}"
ARGS="--model ${MODEL_PATH} --model_alias ${RUNPOD_LLM_MODEL_ALIAS}"

# Offload all layers to GPU where possible
ARGS="${ARGS} --n_gpu_layers -1"

# Enable flash attention: 1 = enabled, 0 = disabled
if [ -n "${RUNPOD_LLM_FLASH_ATTENTION}" ]; then
    if [ "${RUNPOD_LLM_FLASH_ATTENTION}" = "1" ]; then
        ARGS="${ARGS} --flash_attn True"
    else
        ARGS="${ARGS} --flash_attn False"
    fi
fi
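# Note: in llama.cpp, quantizing the V cache (--type_v, set further below) generally
# requires flash attention to be enabled; the GGML_CUDA_FA_ALL_QUANTS flag in the
# install step above builds flash-attention kernels for all KV-cache quantization
# combinations.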
# Set the split mode to row
# LLAMA_SPLIT_MODE_NONE  = 0
# LLAMA_SPLIT_MODE_LAYER = 1
# LLAMA_SPLIT_MODE_ROW   = 2
ARGS="${ARGS} --split_mode 2"

# Set the context limit: 0 = model default, any other value = custom
if [ -n "${RUNPOD_LLM_CONTEXT_LIMIT}" ]; then
    ARGS="${ARGS} --n_ctx ${RUNPOD_LLM_CONTEXT_LIMIT}"
fi

# Set the KV-cache quantization type
if [ -n "${RUNPOD_LLM_CACHE_QUANTIZATION}" ]; then
    # Map quantization string to integer value
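    # Note: these integer codes are hard-coded here and are meant to correspond to the
    # ggml_type enum of the installed llama.cpp build (exposed as llama_cpp.GGML_TYPE_*
    # in the Python binding); verify them against your installed version if cache
    # quantization misbehaves.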
    case "${RUNPOD_LLM_CACHE_QUANTIZATION}" in
        "f32")    QUANT_VALUE=0  ;;
        "f16")    QUANT_VALUE=1  ;;
        "bf16")   QUANT_VALUE=32 ;;
        "q8_0")   QUANT_VALUE=7  ;;
        "q4_0")   QUANT_VALUE=2  ;;
        "q4_1")   QUANT_VALUE=3  ;;
        "iq4_nl") QUANT_VALUE=25 ;;
        "q5_0")   QUANT_VALUE=8  ;;
        "q5_1")   QUANT_VALUE=9  ;;
        *)        QUANT_VALUE=1  ;;
    esac
    ARGS="${ARGS} --type_k ${QUANT_VALUE} --type_v ${QUANT_VALUE}"
fi
# Set the host and port
ARGS="${ARGS} --host 0.0.0.0 --port 11434"

# Start the server
python3 -m llama_cpp.server $ARGS > /app/runtime_llamacpp.log 2>&1 &
sleep 10
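# Note: the fixed 10-second sleep is a crude readiness wait. A more robust option
# (assuming the server's OpenAI-compatible /v1/models route is reachable) would be
# to poll until the API responds, e.g.:
#   until curl -sf http://127.0.0.1:11434/v1/models > /dev/null; do sleep 1; done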
if [ "$RUNPOD_LLM_SERVERLESS" = "1" ]; then
    echo "Starting the RunPod serverless handler..."
    python3 -u /app/3_runtime_runpod_serverless.py
else
    echo "Keeping the container running..."
    tail -f /dev/null
fi
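# Example (sketch, not executed by this script): once the server is up it speaks an
# OpenAI-compatible API on port 11434, so a quick smoke test from inside the container
# might look like the following, with the model name taken from RUNPOD_LLM_MODEL_ALIAS:
#   curl -s http://127.0.0.1:11434/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d "{\"model\": \"${RUNPOD_LLM_MODEL_ALIAS}\", \"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}]}"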