# llama.cpp / Dockerfile
# anews9340's picture
# Update Dockerfile
# ab4bbe1 verified
# Pinned minor tag; for fully reproducible builds consider pinning by digest.
FROM debian:bookworm-slim

# 1. Install build dependencies (sorted alphabetically, one per line).
# pkg-config fixes the "Could NOT find PkgConfig" error from CMake.
# ca-certificates is listed explicitly because --no-install-recommends
# would otherwise skip it and the HTTPS git clone below would fail.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    git \
    libcurl4-openssl-dev \
    libopenblas-dev \
    libssl-dev \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*
# 2. Create the non-root "user" account (UID 1000, the Hugging Face
# Spaces convention) and drop root privileges for everything below.
RUN useradd --create-home --uid 1000 user
USER user

# Home directory and user-local bin on PATH for the unprivileged user.
ENV HOME=/home/user \
PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app
# 3. Clone and build ONLY the llama-server target.
# NOTE(review): an earlier revision used -j 2 to avoid a RAM-exhaustion hang
# at ~54%; -j 8 matches the 8-core host documented below — drop the job count
# back down if the build stalls on memory-constrained machines.
# NOTE: GGML_NATIVE + the AVX512 flags tie the binary to AVX-512-capable CPUs.
# LLAMA_CURL (not GGML_CURL, which is not a recognized CMake option) enables
# the curl backend that the -hf model-download flag in CMD relies on.
RUN git clone --depth 1 https://github.com/ggerganov/llama.cpp.git . && \
cmake -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_NATIVE=ON \
-DGGML_AVX512=ON \
-DGGML_AVX512_VNNI=ON \
-DGGML_OPENMP=ON \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DLLAMA_CURL=ON && \
cmake --build build --config Release --target llama-server -j 8
# 4. Final server configuration.
# EXPOSE is documentation-only (it does not publish the port), but it records
# the Hugging Face Spaces port contract for operators and tooling.
EXPOSE 7860

# Exec (JSON-array) form: the server runs as PID 1 and receives SIGTERM.
# ENTRYPOINT is the binary; CMD holds default args overridable at `docker run`.
# -t 8          : thread count matched to the 8 physical cores
# -hf           : pulls the model directly from Hugging Face (needs curl support)
# --host 0.0.0.0: required for Hugging Face Spaces networking
# --flash-attn  : AVX-512 optimized attention kernels
#                 NOTE(review): newer llama-server builds parse this as
#                 on|off|auto — confirm "true" is accepted by your build.
ENTRYPOINT ["./build/bin/llama-server"]
CMD [ \
"-hf", "unsloth/Qwen3.5-4B-GGUF:Q8_0", \
"--host", "0.0.0.0", \
"--port", "7860", \
"-t", "8", \
"-c", "4096", \
"--flash-attn", "true", \
"--no-mmap" \
]