# Base image: slim variant of Debian 12 (bookworm).
FROM debian:bookworm-slim
# 1. Install build dependencies
# pkg-config fixes the CMake "Could NOT find PkgConfig" error.
# ca-certificates is listed explicitly: with --no-install-recommends it is no
# longer pulled in implicitly, and git / libcurl need it for HTTPS downloads.
# The apt lists are removed in the same layer so they do not bloat the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    git \
    libcurl4-openssl-dev \
    libopenblas-dev \
    libssl-dev \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*
# 2. Set up the Hugging Face user: a non-root account with UID 1000 and a home
#    directory. Everything after this point runs as that account.
RUN useradd --create-home --uid 1000 user
USER user
# Point HOME at the new account and put its per-user bin directory first on PATH.
ENV HOME=/home/user
ENV PATH=$HOME/.local/bin:$PATH
WORKDIR /home/user/app
# 3. Clone and build ONLY the llama-server target
# LLAMA_CPP_REF: branch or tag to check out (default: master). Pass a release
#   tag via --build-arg LLAMA_CPP_REF=<tag> for a reproducible build.
# BUILD_JOBS: number of parallel compile jobs (default: 8). Lower it, e.g.
#   --build-arg BUILD_JOBS=2, if the build hangs partway through because the
#   builder runs out of RAM.
# NOTE(review): GGML_NATIVE=ON tunes the binary for the CPU of the build
#   machine — confirm the runtime host supports the same instruction set.
# NOTE(review): GGML_CURL does not appear among llama.cpp's documented build
#   options (the documented name is LLAMA_CURL) — confirm it has any effect.
ARG LLAMA_CPP_REF=master
ARG BUILD_JOBS=8
RUN git clone --depth 1 --branch "${LLAMA_CPP_REF}" https://github.com/ggerganov/llama.cpp.git . && \
    cmake -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_NATIVE=ON \
    -DGGML_AVX512=ON \
    -DGGML_AVX512_VNNI=ON \
    -DGGML_OPENMP=ON \
    -DGGML_BLAS=ON \
    -DGGML_BLAS_VENDOR=OpenBLAS \
    -DGGML_CURL=ON && \
    cmake --build build --config Release --target llama-server -j "${BUILD_JOBS}"
# 4. Final server configuration
# ENTRYPOINT is the server binary; CMD holds the default arguments, so they can
# be overridden at `docker run` time without replacing the binary.
# -hf: pulls the model directly from Hugging Face
# --host 0.0.0.0: required for Hugging Face Spaces networking
# --port 7860: port the server listens on (documented below via EXPOSE)
# -t 8: optimized for 8 physical cores (prevents hyperthreading slowdowns)
# -c 4096: context size
# --flash-attn on: enables flash attention ("on" is one of the documented
#   values on|off|auto)
# --no-mmap: do not memory-map the model file
EXPOSE 7860
ENTRYPOINT ["./build/bin/llama-server"]
CMD [ \
    "-hf", "unsloth/Qwen3.5-4B-GGUF:Q8_0", \
    "--host", "0.0.0.0", \
    "--port", "7860", \
    "-t", "8", \
    "-c", "4096", \
    "--flash-attn", "on", \
    "--no-mmap" \
]