# AI Engineering Lab
# Initial release: TurboQuant practical guide for consumer hardware
# Commit: 87efc66
# TurboQuant llama.cpp — CUDA Build
# Builds llama-server with TurboQuant KV-cache quantization support
# turbo2 / turbo3 / turbo4 cache types enabled
#
# CRITICAL: Use --branch feature/turboquant-kv-cache (NOT master!)
# The master branch is a standard llama.cpp without TurboQuant support.
#
# Usage:
#   docker build -t turboquant:feature .
#   docker run --rm turboquant:feature llama-server -h 2>&1 | grep -A3 "cache-type-k"
#   # The output must list turbo2, turbo3, and turbo4.
# CUDA "devel" base image: ships nvcc and the CUDA headers/libraries needed to
# compile the CUDA backend.
FROM nvidia/cuda:12.6.3-devel-ubuntu22.04

# Build-time only: stops apt from prompting during the image build without
# leaving DEBIAN_FRONTEND set in the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive

# Toolchain and utilities. Recommended packages are skipped to keep the layer
# small; ca-certificates is listed explicitly because the HTTPS git clone
# needs it and it must not depend on being pulled in as a recommendation.
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        git \
        python3 \
        python3-pip \
        wget \
    && rm -rf /var/lib/apt/lists/*
# All sources are fetched and compiled under /build.
WORKDIR /build

# CRITICAL: the clone must check out feature/turboquant-kv-cache.
# The repository's default 'master' branch does NOT provide the
# turbo2/turbo3/turbo4 cache types.
# A shallow clone (depth 1) is enough because only the branch tip is built.
RUN git clone \
        --depth=1 \
        --branch feature/turboquant-kv-cache \
        https://github.com/TheTom/llama-cpp-turboquant.git \
        llama-cpp-turboquant

# Subsequent build steps run from the root of the checked-out tree.
WORKDIR /build/llama-cpp-turboquant
# Fix: libcuda.so.1 is not available at build time (the driver is injected at
# runtime only). The devel image provides a stub at
# /usr/local/cuda/lib64/stubs/libcuda.so; symlink it to .1 and register the
# directory with the dynamic linker so the cmake build can link against it.
#
# The config file is named with a "zz-" prefix so that it sorts after the other
# /etc/ld.so.conf.d/*.conf entries. The intent is that the stub directory is
# listed last and only serves as a fallback (build time, or containers started
# without a GPU), instead of being listed ahead of the system library
# directories where a real driver library may be provided at runtime.
# NOTE(review): verify on the target runtime with `ldconfig -p | grep libcuda`
# that the driver's libcuda.so.1 is resolved ahead of the stub.
RUN ln -sf /usr/local/cuda/lib64/stubs/libcuda.so \
        /usr/local/cuda/lib64/stubs/libcuda.so.1 \
    && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/zz-cuda-stubs.conf \
    && ldconfig
# Number of parallel compile jobs. The default of 4 matches the previously
# hard-coded value; override with `--build-arg LLAMA_BUILD_JOBS=<n>` to suit
# the build machine's cores and memory.
ARG LLAMA_BUILD_JOBS=4

# IMPORTANT: Use -DGGML_CUDA=ON (not -DLLAMA_CUBLAS=ON, which was renamed in ~2024).
# Only the llama-server target is built, in Release mode.
RUN cmake -S . -B build \
        -DCMAKE_BUILD_TYPE=Release \
        -DGGML_CUDA=ON \
    && cmake --build build --config Release --target llama-server \
        --parallel "${LLAMA_BUILD_JOBS}"

# Put the server binary on PATH.
# NOTE(review): the build tree is intentionally left in place; if llama-server
# loads shared libraries from build/bin, deleting /build would break it —
# confirm before adding any cleanup step.
RUN cp build/bin/llama-server /usr/local/bin/llama-server
# Documents the container port (EXPOSE is metadata only; it does not publish
# the port). NOTE(review): presumably the port llama-server is started on in
# the accompanying guide — confirm against the run commands.
EXPOSE 8180/tcp

# Containers start in /models.
WORKDIR /models

# By default the container only prints the server's help text.
# Override CMD in `docker run` to actually serve a model.
CMD [ "llama-server", "--help" ]