# TurboQuant llama.cpp — CUDA Build
# Builds llama-server with TurboQuant KV-cache quantization support
# turbo2 / turbo3 / turbo4 cache types enabled
#
# CRITICAL: Use --branch feature/turboquant-kv-cache (NOT master!)
# The master branch is a standard llama.cpp without TurboQuant support.
#
# Usage:
#   docker build -t turboquant:feature .
#   docker run --rm turboquant:feature llama-server -h 2>&1 | grep -A3 "cache-type-k"
#   # Must show: turbo2, turbo3, turbo4

# Base image: CUDA 12.6.3 "devel" variant on Ubuntu 22.04. The devel variant is
# used because llama.cpp is compiled inside this image.
FROM nvidia/cuda:12.6.3-devel-ubuntu22.04

# Build-time only: stops apt/debconf from prompting during `docker build`.
# Declared as ARG (not ENV) so the setting is visible to the RUN steps below
# but is not baked into the environment of containers started from the image.
ARG DEBIAN_FRONTEND=noninteractive
# Toolchain and utilities used to fetch and compile llama.cpp.
# --no-install-recommends keeps optional "recommended" packages out of the
# image; ca-certificates is therefore listed explicitly so HTTPS access
# (git clone, curl, wget) keeps working.
# The apt package lists are removed in the same layer so they never end up in
# the image. Packages are sorted alphabetically for easier diffs.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    git \
    python3 \
    python3-pip \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Scratch area that holds the source checkout.
WORKDIR /build

# Fetch the TurboQuant fork of llama.cpp.
# The branch MUST be feature/turboquant-kv-cache: the repository's default
# 'master' branch does NOT provide the turbo2/turbo3/turbo4 cache types.
# --depth=1 fetches only the tip of that branch instead of the full history.
RUN git clone \
    --depth=1 \
    --branch feature/turboquant-kv-cache \
    https://github.com/TheTom/llama-cpp-turboquant.git \
    llama-cpp-turboquant

# All following build steps run from the repository root.
WORKDIR /build/llama-cpp-turboquant

# libcuda.so.1 belongs to the NVIDIA driver, which is injected only when a
# container runs; it is not present during `docker build`. The devel image
# ships a link-time stub (stubs/libcuda.so), so this step:
#   1. registers the stubs directory with the dynamic linker,
#   2. exposes the stub under the libcuda.so.1 name the linker looks for,
#   3. refreshes the linker cache.
# The stubs directory stays registered in the final image.
# NOTE(review): confirm the real driver library takes precedence over the stub
# at runtime when GPUs are attached.
RUN stubs=/usr/local/cuda/lib64/stubs \
    && echo "$stubs" > /etc/ld.so.conf.d/cuda-stubs.conf \
    && ln -sf "$stubs/libcuda.so" "$stubs/libcuda.so.1" \
    && ldconfig

# Configure a Release build with the CUDA backend and compile only the
# llama-server target.
# IMPORTANT: the CUDA switch is -DGGML_CUDA=ON (the older -DLLAMA_CUBLAS=ON
# name was renamed in ~2024).
# BUILD_JOBS controls build parallelism. It defaults to 4 (the previously
# hard-coded value) and can be overridden per build:
#   docker build --build-arg BUILD_JOBS=16 -t turboquant:feature .
ARG BUILD_JOBS=4
RUN cmake -B build \
    -DGGML_CUDA=ON \
    -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -j"${BUILD_JOBS}" --target llama-server

# Copy the freshly built server out of the build tree into /usr/local/bin,
# matching the bare `llama-server` name used by CMD.
RUN cp /build/llama-cpp-turboquant/build/bin/llama-server /usr/local/bin/llama-server

# Documentation only: EXPOSE does not publish the port; map it explicitly with
# `docker run -p`.
# NOTE(review): presumably the server is meant to be started on port 8180 —
# confirm against the actual launch command.
EXPOSE 8180

# Containers start in /models (the directory is created here if missing).
# NOTE(review): presumably the place where model files are mounted — confirm.
WORKDIR /models

# Default command only prints the CLI help. Supply your own command to
# `docker run` to actually serve a model.
CMD ["llama-server", "--help"]