# TurboQuant llama.cpp — CUDA Build
# Builds llama-server with TurboQuant KV-cache quantization support
# (turbo2 / turbo3 / turbo4 cache types enabled).
#
# CRITICAL: Use --branch feature/turboquant-kv-cache (NOT master!)
# The master branch is a standard llama.cpp without TurboQuant support.
#
# Usage:
#   docker build -t turboquant:feature .
#   docker run --rm turboquant:feature llama-server -h 2>&1 | grep -A3 "cache-type-k"
#   # Must show: turbo2, turbo3, turbo4
#
# Optional build arguments (defaults reproduce the original recipe):
#   TURBOQUANT_REPO    git URL of the TurboQuant fork
#   TURBOQUANT_BRANCH  branch to build
#   BUILD_JOBS         number of parallel compile jobs

FROM nvidia/cuda:12.6.3-devel-ubuntu22.04

# Build-time only: keeps apt from prompting without persisting the variable
# into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive

# ca-certificates is listed explicitly because --no-install-recommends would
# otherwise drop it, and the HTTPS git clone below needs it.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        git \
        python3 \
        python3-pip \
        wget \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /build

# CRITICAL: Must use --branch feature/turboquant-kv-cache
# Default 'master' does NOT have turbo2/turbo3/turbo4 cache types!
ARG TURBOQUANT_REPO=https://github.com/TheTom/llama-cpp-turboquant.git
ARG TURBOQUANT_BRANCH=feature/turboquant-kv-cache

# The target directory is named explicitly so the WORKDIR below stays valid
# even when TURBOQUANT_REPO is overridden with a differently named repository.
RUN git clone "${TURBOQUANT_REPO}" \
        --branch "${TURBOQUANT_BRANCH}" \
        --depth=1 \
        llama-cpp-turboquant

WORKDIR /build/llama-cpp-turboquant

# Fix: libcuda.so.1 is not available at build time (the driver is injected at
# runtime only). The devel image provides a stub at
# /usr/local/cuda/lib64/stubs/libcuda.so; symlink it to .1 so the linker finds
# it during the cmake build.
# NOTE(review): the stub directory remains registered with ldconfig in the
# final image — confirm the real driver library takes precedence at runtime.
RUN ln -sf /usr/local/cuda/lib64/stubs/libcuda.so \
        /usr/local/cuda/lib64/stubs/libcuda.so.1 \
    && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/cuda-stubs.conf \
    && ldconfig

# IMPORTANT: Use -DGGML_CUDA=ON (not -DLLAMA_CUBLAS=ON which was renamed in ~2024)
# BUILD_JOBS defaults to 4, matching the previously hard-coded -j4.
ARG BUILD_JOBS=4
RUN cmake -B build \
        -DGGML_CUDA=ON \
        -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -j"${BUILD_JOBS}" --target llama-server

RUN cp build/bin/llama-server /usr/local/bin/llama-server

WORKDIR /models

EXPOSE 8180

# Default: show help. Override CMD in docker run to actually serve a model.
CMD ["llama-server", "--help"]