# llama.cpp / Dockerfile
# anews9340's picture
# Update Dockerfile
# ab4bbe1 verified
# Pinned minor tag; for fully reproducible builds consider pinning by digest.
FROM debian:bookworm-slim

# 1. Install build dependencies (sorted alphabetically, one per line).
# pkg-config fixes the "Could NOT find PkgConfig" error from CMake.
# ca-certificates is listed explicitly because --no-install-recommends
# would otherwise skip it and the HTTPS git clone below would fail.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    git \
    libcurl4-openssl-dev \
    libopenblas-dev \
    libssl-dev \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*
# 2. Create the non-root "user" account (UID 1000, the Hugging Face
# Spaces convention) and drop root privileges for everything below.
RUN useradd --create-home --uid 1000 user
USER user

# Home directory and user-local bin on PATH for the unprivileged user.
ENV HOME=/home/user \
PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app
# 3. Clone and build ONLY the llama-server target.
# NOTE(review): an earlier revision used -j 2 to avoid a RAM-exhaustion hang
# at ~54%; -j 8 matches the 8-core host documented below — drop the job count
# back down if the build stalls on memory-constrained machines.
# NOTE: GGML_NATIVE + the AVX512 flags tie the binary to AVX-512-capable CPUs.
# LLAMA_CURL (not GGML_CURL, which is not a recognized CMake option) enables
# the curl backend that the -hf model-download flag in CMD relies on.
RUN git clone --depth 1 https://github.com/ggerganov/llama.cpp.git . && \
cmake -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_NATIVE=ON \
-DGGML_AVX512=ON \
-DGGML_AVX512_VNNI=ON \
-DGGML_OPENMP=ON \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DLLAMA_CURL=ON && \
cmake --build build --config Release --target llama-server -j 8
# 4. Final server configuration.
# EXPOSE is documentation-only (it does not publish the port), but it records
# the Hugging Face Spaces port contract for operators and tooling.
EXPOSE 7860

# Exec (JSON-array) form: the server runs as PID 1 and receives SIGTERM.
# ENTRYPOINT is the binary; CMD holds default args overridable at `docker run`.
# -t 8          : thread count matched to the 8 physical cores
# -hf           : pulls the model directly from Hugging Face (needs curl support)
# --host 0.0.0.0: required for Hugging Face Spaces networking
# --flash-attn  : AVX-512 optimized attention kernels
#                 NOTE(review): newer llama-server builds parse this as
#                 on|off|auto — confirm "true" is accepted by your build.
ENTRYPOINT ["./build/bin/llama-server"]
CMD [ \
"-hf", "unsloth/Qwen3.5-4B-GGUF:Q8_0", \
"--host", "0.0.0.0", \
"--port", "7860", \
"-t", "8", \
"-c", "4096", \
"--flash-attn", "true", \
"--no-mmap" \
]