FROM debian:bookworm-slim

# 1. Install dependencies
# pkg-config is included to fix the "Could NOT find PkgConfig" CMake error
RUN apt-get update && apt-get install -y \
    build-essential \
    cmake \
    git \
    pkg-config \
    libcurl4-openssl-dev \
    libssl-dev \
    libopenblas-dev \
    && rm -rf /var/lib/apt/lists/*

# 2. Set up the Hugging Face Spaces user
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# 3. Clone and build ONLY the llama-server target
# -j 2 keeps peak memory low and prevents the build hanging around 54% from RAM exhaustion
# Note: with -DGGML_NATIVE=ON, ggml auto-detects CPU features on the build host;
# the explicit AVX-512 flags only matter if native detection is turned off
RUN git clone --depth 1 https://github.com/ggerganov/llama.cpp.git . && \
    cmake -B build \
        -DCMAKE_BUILD_TYPE=Release \
        -DGGML_NATIVE=ON \
        -DGGML_AVX512=ON \
        -DGGML_AVX512_VNNI=ON \
        -DGGML_OPENMP=ON \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=OpenBLAS \
        -DLLAMA_CURL=ON && \
    cmake --build build --config Release --target llama-server -j 2

# 4. Final server configuration
# -t 8:            matches the 8 physical cores (avoids hyperthreading slowdowns)
# -hf:             pulls the GGUF model directly from Hugging Face
# --host 0.0.0.0:  required for Hugging Face Spaces networking
# --flash-attn on: enables the flash-attention kernels
ENTRYPOINT ["./build/bin/llama-server"]
CMD [ \
    "-hf", "unsloth/Qwen3.5-4B-GGUF:Q8_0", \
    "--host", "0.0.0.0", \
    "--port", "7860", \
    "-t", "8", \
    "-c", "4096", \
    "--flash-attn", "on", \
    "--no-mmap" \
]
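
Because -DGGML_NATIVE=ON compiles for the build host's CPU, the resulting binary will only run on machines with the same instruction set. A quick sketch for confirming that the host actually exposes the AVX-512 features the build enables; this assumes a Linux host where /proc/cpuinfo is available:

# Report whether the host CPU advertises the AVX-512 features used by the build.
# Linux-only: reads the kernel's feature flags from /proc/cpuinfo.
from pathlib import Path

cpuinfo = Path("/proc/cpuinfo").read_text()
for feature in ("avx512f", "avx512_vnni"):
    status = "present" if feature in cpuinfo else "MISSING"
    print(f"{feature}: {status}")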
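
To smoke-test the server, a minimal client sketch using only the Python standard library. It assumes the container is reachable on localhost:7860 (e.g. started with "docker run -p 7860:7860 <image>"); llama-server exposes a /health endpoint and an OpenAI-compatible /v1/chat/completions endpoint:

# Minimal smoke test for the running llama-server container.
# Assumption: the container's port 7860 is published on localhost.
import json
import urllib.request

BASE = "http://localhost:7860"

# /health returns 200 once the model has finished loading;
# urlopen raises HTTPError (503) while it is still loading, so retry in that case.
with urllib.request.urlopen(f"{BASE}/health") as resp:
    print("health:", resp.status)

# Send a chat completion request to the OpenAI-compatible endpoint.
# No "model" field is needed since the server loads a single model.
payload = {
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_tokens": 64,
}
req = urllib.request.Request(
    f"{BASE}/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)
    print(body["choices"][0]["message"]["content"])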