Spaces:

anews9340
/

llama.cpp

Paused

App Files Files Community

anews9340 committed on Mar 2

Commit

f5b5b66

verified ·

1 Parent(s): 9543bb6

Create Dockerfile

Browse files

Files changed (1) hide show

Dockerfile +54 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,54 @@

+FROM debian:bookworm-slim
+# Suppress interactive apt/debconf prompts while the image is being built.
+# Declared as a build ARG rather than ENV so the setting is visible to the RUN
+# steps of this stage but is not baked into the running container's environment.
+ARG DEBIAN_FRONTEND=noninteractive
+# 1. Install build tools and the CURL/TLS dependencies (needed for the -hf flag)
+# --no-install-recommends keeps optional packages out of the image. Because of
+# that, ca-certificates is listed explicitly: it is what lets HTTPS certificate
+# verification succeed for the git clone at build time and for model downloads
+# at run time. Packages are sorted alphabetically to keep diffs small.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake \
+    git \
+    libcurl4-openssl-dev \
+    libopenblas-dev \
+    libssl-dev \
+    && rm -rf /var/lib/apt/lists/*
+# 2. Create and switch to the unprivileged "user" account, UID 1000 (Required for Spaces)
+RUN useradd --create-home --uid 1000 user
+USER user
+# HOME points at that account's directory; its per-user bin directory goes first on PATH.
+ENV HOME=/home/user
+ENV PATH=/home/user/.local/bin:${PATH}
+# Later instructions and the container's main process run from the app directory.
+WORKDIR /home/user/app
+# 3. Clone and compile llama-server for Xeon Platinum 8375C (Ice Lake)
+# LLAMA_CPP_REF: branch or tag to build. Defaults to master; pass
+#   --build-arg LLAMA_CPP_REF=<tag> to get a reproducible image.
+# -DBUILD_SHARED_LIBS=OFF: link the llama/ggml libraries statically. Only the
+#   llama-server executable is kept, so it must not depend on shared libraries
+#   that live in (and are deleted with) the build tree.
+# -DGGML_NATIVE=OFF: native mode tunes for the CPU of the machine running the
+#   build, which is not necessarily the machine running the container, and it
+#   takes precedence over the per-feature switches. The instruction sets are
+#   therefore listed explicitly.
+# -DGGML_AVX512 & VNNI: the hardware acceleration flags found in the target cpuinfo
+# -DLLAMA_CURL=ON: enables the -hf downloading capability through libcurl.
+# -DLLAMA_OPENSSL=ON: HTTPS support for revisions that download through the
+#   bundled HTTP client instead of libcurl (presumably newer ones; TODO confirm
+#   against the pinned ref). CMake only warns about an option a revision lacks.
+# --target llama-server: build just the server instead of every tool and example.
+# The checkout lives in /tmp and is removed in the same layer, so neither the
+# sources nor the build artifacts add to the image size.
+ARG LLAMA_CPP_REF=master
+RUN git clone --depth 1 --branch "${LLAMA_CPP_REF}" \
+        https://github.com/ggml-org/llama.cpp.git /tmp/llama.cpp && \
+    cmake -S /tmp/llama.cpp -B /tmp/llama.cpp/build \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DBUILD_SHARED_LIBS=OFF \
+        -DGGML_NATIVE=OFF \
+        -DGGML_AVX=ON \
+        -DGGML_AVX2=ON \
+        -DGGML_FMA=ON \
+        -DGGML_F16C=ON \
+        -DGGML_AVX512=ON \
+        -DGGML_AVX512_VNNI=ON \
+        -DGGML_OPENMP=ON \
+        -DLLAMA_CURL=ON \
+        -DLLAMA_OPENSSL=ON && \
+    cmake --build /tmp/llama.cpp/build --config Release --target llama-server -j "$(nproc)" && \
+    cp /tmp/llama.cpp/build/bin/llama-server . && \
+    rm -rf /tmp/llama.cpp
+# 4. Final Server Configuration
+# -hf: Pulls the model directly from Hugging Face
+# --host 0.0.0.0: Required for Hugging Face Spaces networking
+# --port 7860: Port the server listens on (also declared via EXPOSE below)
+# -t 8: Optimized for your 8 physical cores (prevents hyperthreading slowdowns)
+# -c 4096: Context size in tokens
+# --flash-attn on: Enables Flash Attention. Current llama.cpp expects an explicit
+#   on/off/auto value here; a bare "--flash-attn" would take the next argument
+#   ("--no-mmap") as its value and abort at startup.
+# --no-mmap: Read the model into memory instead of memory-mapping the file
+# ENTRYPOINT is the binary and CMD holds the default arguments, so the
+# arguments can be replaced at run time without touching the entrypoint.
+EXPOSE 7860
+ENTRYPOINT ["./llama-server"]
+CMD [ \
+    "-hf", "unsloth/Qwen3.5-9B-GGUF:Q8_0", \
+    "--host", "0.0.0.0", \
+    "--port", "7860", \
+    "-t", "8", \
+    "-c", "4096", \
+    "--flash-attn", "on", \
+    "--no-mmap" \
+    ]